├── drain3 ├── py.typed ├── __init__.py ├── persistence_handler.py ├── memory_buffer_persistence.py ├── file_persistence.py ├── redis_persistence.py ├── kafka_persistence.py ├── masking.py ├── template_miner_config.py ├── simple_profiler.py ├── jaccard_drain.py ├── template_miner.py └── drain.py ├── deploy_new_ver.sh ├── .gitignore ├── tests ├── drain3_test.ini ├── test_masking.py ├── test_drain.py ├── test_jaccard_drain.py └── test_template_miner.py ├── LICENSE.txt ├── examples ├── drain3.ini ├── drain_bigfile_demo.py └── drain_stdin_demo.py ├── pyproject.toml ├── CONTRIBUTING.md ├── .github └── workflows │ └── CI.yaml ├── README.md └── poetry.lock /drain3/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deploy_new_ver.sh: -------------------------------------------------------------------------------- 1 | python3 setup.py sdist 2 | twine upload dist/* -------------------------------------------------------------------------------- /drain3/__init__.py: -------------------------------------------------------------------------------- 1 | from drain3.template_miner import TemplateMiner 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | MANIFEST 3 | dist/* 4 | *venv/* 5 | .idea/* 6 | .vscode/* 7 | drain3.egg-info/* 8 | snapshot.txt 9 | examples/snapshot.txt 10 | *.bin 11 | *.log 12 | *.gz -------------------------------------------------------------------------------- /drain3/persistence_handler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | 7 | class PersistenceHandler(ABC): 8 | 9 | @abstractmethod 10 | def save_state(self, state: bytes) -> None: 11 | pass 12 | 13 | @abstractmethod 14 | def load_state(self) -> Optional[bytes]: 15 | pass 16 | -------------------------------------------------------------------------------- /drain3/memory_buffer_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from typing import Optional 4 | 5 | from drain3.persistence_handler import PersistenceHandler 6 | 7 | 8 | class MemoryBufferPersistence(PersistenceHandler): 9 | def __init__(self) -> None: 10 | self.state: Optional[bytes] = None 11 | 12 | def save_state(self, state: bytes) -> None: 13 | self.state = state 14 | 15 | def load_state(self) -> Optional[bytes]: 16 | return self.state -------------------------------------------------------------------------------- /drain3/file_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import os 4 | import pathlib 5 | from typing import Optional 6 | 7 | from drain3.persistence_handler import PersistenceHandler 8 | 9 | 10 | class FilePersistence(PersistenceHandler): 11 | def __init__(self, file_path: str) -> None: 12 | self.file_path = file_path 13 | 14 | def save_state(self, state: bytes) -> None: 15 | pathlib.Path(self.file_path).write_bytes(state) 16 | 17 | def load_state(self) -> Optional[bytes]: 18 | if not os.path.exists(self.file_path): 19 | return None 20 | 21 | return pathlib.Path(self.file_path).read_bytes() 22 | 
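A minimal usage sketch of the persistence API above, assuming only what the bundled examples and tests show: TemplateMiner takes a persistence handler as its first constructor argument (falling back to its default configuration when no config is given), and add_log_message returns a dict containing "change_type" and "template_mined". The log line itself is illustrative.

from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence

# Keep Drain3 state in a local file so mined clusters survive process restarts.
persistence = FilePersistence("drain3_state.bin")
template_miner = TemplateMiner(persistence)

result = template_miner.add_log_message("connected to host alpha, retry 3")
print(result["change_type"], result["template_mined"])

Any other PersistenceHandler subclass (MemoryBufferPersistence, RedisPersistence, KafkaPersistence, or a custom handler implementing save_state/load_state) can be passed in the same way.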
-------------------------------------------------------------------------------- /tests/drain3_test.ini: -------------------------------------------------------------------------------- 1 | [SNAPSHOT] 2 | snapshot_interval_minutes = 10 3 | compress_state = True 4 | 5 | [MASKING] 6 | masking = [ 7 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, 8 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, 9 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 10 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 11 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, 12 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}, 13 | {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"} 14 | ] 15 | 16 | [DRAIN] 17 | engine = Drain 18 | sim_th = 0.4 19 | depth = 4 20 | max_children = 100 21 | max_clusters = 1024 22 | extra_delimiters = ["_"] 23 | 24 | [PROFILING] 25 | enabled = True 26 | report_sec = 30 27 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022 International Business Machines 4 | and the Drain3 project contributors. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /drain3/redis_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from typing import Optional, Union 4 | 5 | import redis 6 | 7 | from drain3.persistence_handler import PersistenceHandler 8 | 9 | 10 | class RedisPersistence(PersistenceHandler): 11 | def __init__(self, 12 | redis_host: str, 13 | redis_port: int, 14 | redis_db: int, 15 | redis_pass: Optional[str], 16 | is_ssl: bool, 17 | redis_key: Union[bytes, str, memoryview]) -> None: 18 | self.redis_host = redis_host 19 | self.redis_port = redis_port 20 | self.redis_db = redis_db 21 | self.redis_pass = redis_pass 22 | self.is_ssl = is_ssl 23 | self.redis_key = redis_key 24 | self.r = redis.Redis(host=self.redis_host, 25 | port=self.redis_port, 26 | db=self.redis_db, 27 | password=self.redis_pass, 28 | ssl=self.is_ssl) 29 | 30 | def save_state(self, state: bytes) -> None: 31 | self.r.set(self.redis_key, state) 32 | 33 | def load_state(self) -> Optional[bytes]: 34 | return self.r.get(self.redis_key) 35 | -------------------------------------------------------------------------------- /examples/drain3.ini: -------------------------------------------------------------------------------- 1 | [SNAPSHOT] 2 | snapshot_interval_minutes = 10 3 | compress_state = True 4 | 5 | [MASKING] 6 | masking = [ 7 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, 8 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, 9 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 10 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, 11 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, 12 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}, 13 | {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"} 14 | ] 15 | mask_prefix = <: 16 | mask_suffix = :> 17 | 18 | [DRAIN] 19 | # engine is Optional parameter. Engine will be "Drain" if the engine argument is not specified. 20 | # engine has two options: 'Drain' and 'JaccardDrain'. 
21 | # engine = Drain 22 | sim_th = 0.4 23 | depth = 4 24 | max_children = 100 25 | max_clusters = 1024 26 | extra_delimiters = ["_"] 27 | 28 | [PROFILING] 29 | enabled = True 30 | report_sec = 30 31 | -------------------------------------------------------------------------------- /tests/test_masking.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import unittest 4 | 5 | from drain3.masking import MaskingInstruction, LogMasker 6 | 7 | 8 | class MaskingTest(unittest.TestCase): 9 | 10 | def test_instructions_by_mask_name(self): 11 | instructions = [] 12 | a = MaskingInstruction(r"a", "1") 13 | instructions.append(a) 14 | b = MaskingInstruction(r"b", "1") 15 | instructions.append(b) 16 | c = MaskingInstruction(r"c", "2") 17 | instructions.append(c) 18 | d = MaskingInstruction(r"d", "3") 19 | instructions.append(d) 20 | x = MaskingInstruction(r"x", "something else") 21 | instructions.append(x) 22 | y = MaskingInstruction(r"y", "something else") 23 | instructions.append(y) 24 | masker = LogMasker(instructions, "", "") 25 | self.assertCountEqual(["1", "2", "3", "something else"], masker.mask_names) 26 | self.assertCountEqual([a, b], masker.instructions_by_mask_name("1")) 27 | self.assertCountEqual([c], masker.instructions_by_mask_name("2")) 28 | self.assertCountEqual([d], masker.instructions_by_mask_name("3")) 29 | self.assertCountEqual([x, y], masker.instructions_by_mask_name("something else")) 30 | 31 | def test_mask(self): 32 | s = "D9 test 999 888 1A ccc 3" 33 | mi = MaskingInstruction(r"((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)", "NUM") 34 | masker = LogMasker([mi], "<", ">") 35 | masked = masker.mask(s) 36 | self.assertEqual("D9 test <NUM> <NUM> 1A ccc <NUM>", masked) 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "drain3" 3 | version = "0.9.11" 4 | description = "Persistent & streaming log template miner" 5 | authors = ["IBM Research Haifa "] 6 | maintainers = ["Yihao Chen(Superskyyy) "] 7 | readme = "README.md" 8 | license = "MIT" 9 | keywords=['drain', 'log', 'parser', 'IBM', 'template', 'logs', 'miner'] 10 | 11 | classifiers=[ 12 | "Programming Language :: Python :: 3.7", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | 'Topic :: System :: Monitoring', 20 | "Topic :: Software Development :: Libraries", 21 | ] 22 | 23 | packages = [ 24 | { include = "drain3" }, 25 | ] 26 | 27 | exclude = ['tests', 'examples'] 28 | 29 | [tool.poetry.build] 30 | generate-setup-file = true 31 | 32 | 33 | [tool.poetry.dependencies] 34 | python = "^3.7" 35 | jsonpickle = "*" 36 | cachetools = "*" 37 | redis = { version = "*", optional = true } 38 | kafka-python = { version = "*", optional = true } 39 | 40 | [tool.poetry.extras] 41 | all=[ 42 | 'redis', 43 | 'kafka-python', 44 | ] 45 | kafka= [ 46 | 'kafka-python', 47 | ] 48 | redis=[ 49 | 'redis', 50 | ] 51 | 52 | [tool.poetry.group.dev.dependencies] 53 | mypy = "*" 54 | types-cachetools = "*" 55 | types-redis = "*" 56 | 57 | [build-system] 58 | requires = ["poetry-core"] 59 | build-backend = "poetry.core.masonry.api" 60 | 
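The MASKING sections in tests/drain3_test.ini and examples/drain3.ini above are loaded by TemplateMinerConfig into MaskingInstruction objects and applied through LogMasker (both defined in drain3/masking.py further below). A minimal sketch of that masking step in isolation, reusing the NUM pattern and the <: :> mask prefix/suffix from examples/drain3.ini; the input line is illustrative.

from drain3.masking import LogMasker, MaskingInstruction

# Mask standalone integers; each mask is wrapped in the configured prefix/suffix.
num = MaskingInstruction(r"((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)", "NUM")
masker = LogMasker([num], "<:", ":>")
print(masker.mask("user 42 logged in from port 8080"))
# -> user <:NUM:> logged in from port <:NUM:>

TemplateMinerConfig.load performs the same construction when it parses the masking JSON list from an ini file.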
-------------------------------------------------------------------------------- /drain3/kafka_persistence.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | from typing import Any, cast, Optional 4 | 5 | import kafka # type: ignore[import] 6 | 7 | from drain3.persistence_handler import PersistenceHandler 8 | 9 | 10 | class KafkaPersistence(PersistenceHandler): 11 | 12 | def __init__(self, topic: str, snapshot_poll_timeout_sec: int = 60, **kafka_client_options: Any) -> None: 13 | self.topic = topic 14 | self.kafka_client_options = kafka_client_options 15 | self.producer = kafka.KafkaProducer(**self.kafka_client_options) 16 | self.snapshot_poll_timeout_sec = snapshot_poll_timeout_sec 17 | 18 | def save_state(self, state: bytes) -> None: 19 | self.producer.send(self.topic, value=state) 20 | 21 | def load_state(self) -> Optional[bytes]: 22 | consumer = kafka.KafkaConsumer(**self.kafka_client_options) 23 | partition = kafka.TopicPartition(self.topic, 0) 24 | consumer.assign([partition]) 25 | end_offsets = consumer.end_offsets([partition]) 26 | end_offset = list(end_offsets.values())[0] 27 | if end_offset > 0: 28 | consumer.seek(partition, end_offset - 1) 29 | snapshot_poll_timeout_ms = self.snapshot_poll_timeout_sec * 1000 30 | records = consumer.poll(snapshot_poll_timeout_ms) 31 | if not records: 32 | raise RuntimeError(f"No message received from Kafka during restore even though end_offset>0") 33 | last_msg = records[partition][0] 34 | state = cast(bytes, last_msg.value) 35 | else: 36 | state = None 37 | 38 | consumer.close() 39 | return state 40 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | All contributors must agree to the Developer Certificate of Origin Version 1.1. (DCO 1.1) by signing their commits with: 2 | 3 | ``` 4 | Signed-off-by: [NAME] <[EMAIL]> 5 | ``` 6 | 7 | This can be simply achieved with `git commit -s` when formatting your commit message. 8 | 9 | The full text of the DCO 1.1 is as follows: 10 | 11 | ``` 12 | Developer Certificate of Origin 13 | Version 1.1 14 | 15 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 16 | 660 York Street, Suite 102, 17 | San Francisco, CA 94110 USA 18 | 19 | Everyone is permitted to copy and distribute verbatim copies of this 20 | license document, but changing it is not allowed. 21 | 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 
42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | -------------------------------------------------------------------------------- /examples/drain_bigfile_demo.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import json 4 | import logging 5 | import os 6 | import subprocess 7 | import sys 8 | import time 9 | from os.path import dirname 10 | 11 | from drain3 import TemplateMiner 12 | from drain3.template_miner_config import TemplateMinerConfig 13 | 14 | logger = logging.getLogger(__name__) 15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 16 | 17 | in_gz_file = "SSH.tar.gz" 18 | in_log_file = "SSH.log" 19 | if not os.path.isfile(in_log_file): 20 | logger.info(f"Downloading file {in_gz_file}") 21 | p = subprocess.Popen(f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}", shell=True) 22 | p.wait() 23 | logger.info(f"Extracting file {in_gz_file}") 24 | p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True) 25 | p.wait() 26 | 27 | 28 | config = TemplateMinerConfig() 29 | config.load(f"{dirname(__file__)}/drain3.ini") 30 | config.profiling_enabled = True 31 | template_miner = TemplateMiner(config=config) 32 | 33 | line_count = 0 34 | 35 | with open(in_log_file) as f: 36 | lines = f.readlines() 37 | 38 | start_time = time.time() 39 | batch_start_time = start_time 40 | batch_size = 10000 41 | 42 | for line in lines: 43 | line = line.rstrip() 44 | line = line.partition(": ")[2] 45 | result = template_miner.add_log_message(line) 46 | line_count += 1 47 | if line_count % batch_size == 0: 48 | time_took = time.time() - batch_start_time 49 | rate = batch_size / time_took 50 | logger.info(f"Processing line: {line_count}, rate {rate:.1f} lines/sec, " 51 | f"{len(template_miner.drain.clusters)} clusters so far.") 52 | batch_start_time = time.time() 53 | if result["change_type"] != "none": 54 | result_json = json.dumps(result) 55 | logger.info(f"Input ({line_count}): {line}") 56 | logger.info(f"Result: {result_json}") 57 | 58 | time_took = time.time() - start_time 59 | rate = line_count / time_took 60 | logger.info(f"--- Done processing file in {time_took:.2f} sec. 
Total of {line_count} lines, rate {rate:.1f} lines/sec, " 61 | f"{len(template_miner.drain.clusters)} clusters") 62 | 63 | sorted_clusters = sorted(template_miner.drain.clusters, key=lambda it: it.size, reverse=True) 64 | for cluster in sorted_clusters: 65 | logger.info(cluster) 66 | 67 | print("Prefix Tree:") 68 | template_miner.drain.print_tree() 69 | 70 | template_miner.profiler.report(0) 71 | -------------------------------------------------------------------------------- /.github/workflows/CI.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | schedule: 9 | - cron: '0 18 * * *' 10 | 11 | concurrency: 12 | group: CI-tests-${{ github.event.pull_request.number || github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | changes: 17 | # Check if any file related to Drain3/ CI behavior is changed 18 | # set outputs for other jobs to access for if conditions 19 | name: Check Changes 20 | runs-on: ubuntu-latest 21 | # To prevent error when there's no base branch 22 | if: github.event_name != 'schedule' 23 | timeout-minutes: 10 24 | outputs: 25 | drain3: ${{ steps.filter.outputs.drain3 }} 26 | steps: 27 | - uses: actions/checkout@v3 # required for push event 28 | - name: Check for file changes 29 | uses: getsentry/paths-filter@v2.11.1 30 | id: filter 31 | with: 32 | token: ${{ github.token }} 33 | # The following filters indicate a category along with 34 | # the files that should not be ignored by CI when modified. 35 | filters: | 36 | drain3: 37 | - '.github/**/*.yaml' 38 | - '**/*.py' 39 | - '**/Dockerfile*' 40 | - '**/Makefile' 41 | - 'tests/**' 42 | - '**/*.bat' 43 | - '**/*.sh' 44 | - '**/*.ps1' 45 | - '**/pyproject.toml' 46 | - '**/poetry.lock' 47 | - '**/*.cfg' 48 | - '**/*.ini' 49 | list-files: json # logs matched files 50 | build: 51 | runs-on: ubuntu-latest 52 | needs: [changes] 53 | if: | 54 | ( always() && ! 
cancelled() ) && 55 | ((github.event_name == 'schedule' && github.repository == 'logpai/drain3') || needs.changes.outputs.drain3 == 'true') 56 | 57 | strategy: 58 | matrix: 59 | python-version: [ "3.8", "3.9", "3.10", "3.11" ] 60 | fail-fast: false 61 | env: 62 | PYTHON_VERSION: ${{ matrix.python-version }} 63 | 64 | steps: 65 | - name: Check out Drain3 codebase 66 | uses: actions/checkout@v3 67 | 68 | - name: Set up Python ${{ matrix.python-version }} 69 | uses: actions/setup-python@v4 70 | with: 71 | python-version: ${{ matrix.python-version }} 72 | 73 | - name: Setup Poetry 74 | run: | 75 | python -m pip install --upgrade pip 76 | python -m pip install --upgrade poetry 77 | 78 | - name: Install dependencies 79 | run: poetry install 80 | 81 | - name: Test with unittest 82 | run: poetry run python -m unittest discover --verbose --start-directory tests 83 | -------------------------------------------------------------------------------- /examples/drain_stdin_demo.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import json 4 | import logging 5 | import sys 6 | from os.path import dirname 7 | 8 | from drain3 import TemplateMiner 9 | from drain3.template_miner_config import TemplateMinerConfig 10 | 11 | # persistence_type = "NONE" 12 | # persistence_type = "REDIS" 13 | # persistence_type = "KAFKA" 14 | persistence_type = "FILE" 15 | 16 | logger = logging.getLogger(__name__) 17 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 18 | 19 | if persistence_type == "KAFKA": 20 | from drain3.kafka_persistence import KafkaPersistence 21 | 22 | persistence = KafkaPersistence("drain3_state", bootstrap_servers="localhost:9092") 23 | 24 | elif persistence_type == "FILE": 25 | from drain3.file_persistence import FilePersistence 26 | 27 | persistence = FilePersistence("drain3_state.bin") 28 | 29 | elif persistence_type == "REDIS": 30 | from drain3.redis_persistence import RedisPersistence 31 | 32 | persistence = RedisPersistence(redis_host='', 33 | redis_port=25061, 34 | redis_db=0, 35 | redis_pass='', 36 | is_ssl=True, 37 | redis_key="drain3_state_key") 38 | else: 39 | persistence = None 40 | 41 | config = TemplateMinerConfig() 42 | config.load(f"{dirname(__file__)}/drain3.ini") 43 | config.profiling_enabled = False 44 | 45 | template_miner = TemplateMiner(persistence, config) 46 | print(f"Drain3 started with '{persistence_type}' persistence") 47 | print(f"{len(config.masking_instructions)} masking instructions are in use") 48 | print(f"Starting training mode. Reading from std-in ('q' to finish)") 49 | while True: 50 | log_line = input("> ") 51 | if log_line == 'q': 52 | break 53 | result = template_miner.add_log_message(log_line) 54 | result_json = json.dumps(result) 55 | print(result_json) 56 | template = result["template_mined"] 57 | params = template_miner.extract_parameters(template, log_line) 58 | print(f"Parameters: {str(params)}") 59 | 60 | print("Training done. Mined clusters:") 61 | for cluster in template_miner.drain.clusters: 62 | print(cluster) 63 | 64 | print(f"Starting inference mode, matching to pre-trained clusters. 
Input log lines or 'q' to finish") 65 | while True: 66 | log_line = input("> ") 67 | if log_line == 'q': 68 | break 69 | cluster = template_miner.match(log_line) 70 | if cluster is None: 71 | print(f"No match found") 72 | else: 73 | template = cluster.get_template() 74 | print(f"Matched template #{cluster.cluster_id}: {template}") 75 | print(f"Parameters: {template_miner.get_parameter_list(template, log_line)}") 76 | -------------------------------------------------------------------------------- /drain3/masking.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import abc 4 | import re 5 | from typing import cast, Collection, Dict, List 6 | 7 | 8 | class AbstractMaskingInstruction(abc.ABC): 9 | 10 | def __init__(self, mask_with: str): 11 | self.mask_with = mask_with 12 | 13 | @abc.abstractmethod 14 | def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str: 15 | """ 16 | Mask content according to this instruction and return the result. 17 | 18 | :param content: text to apply masking to 19 | :param mask_prefix: the prefix of any masks inserted 20 | :param mask_suffix: the suffix of any masks inserted 21 | """ 22 | pass 23 | 24 | 25 | class MaskingInstruction(AbstractMaskingInstruction): 26 | 27 | def __init__(self, pattern: str, mask_with: str): 28 | super().__init__(mask_with) 29 | self.regex = re.compile(pattern) 30 | 31 | @property 32 | def pattern(self) -> str: 33 | return self.regex.pattern 34 | 35 | def mask(self, content: str, mask_prefix: str, mask_suffix: str) -> str: 36 | mask = mask_prefix + self.mask_with + mask_suffix 37 | return self.regex.sub(mask, content) 38 | 39 | 40 | # Alias for `MaskingInstruction`. 41 | RegexMaskingInstruction = MaskingInstruction 42 | 43 | 44 | class LogMasker: 45 | 46 | def __init__(self, masking_instructions: Collection[AbstractMaskingInstruction], 47 | mask_prefix: str, mask_suffix: str): 48 | self.mask_prefix = mask_prefix 49 | self.mask_suffix = mask_suffix 50 | self.masking_instructions = masking_instructions 51 | mask_name_to_instructions: Dict[str, List[AbstractMaskingInstruction]] = {} 52 | for mi in self.masking_instructions: 53 | mask_name_to_instructions.setdefault(mi.mask_with, []) 54 | mask_name_to_instructions[mi.mask_with].append(mi) 55 | self.mask_name_to_instructions = mask_name_to_instructions 56 | 57 | def mask(self, content: str) -> str: 58 | for mi in self.masking_instructions: 59 | content = mi.mask(content, self.mask_prefix, self.mask_suffix) 60 | return content 61 | 62 | @property 63 | def mask_names(self) -> Collection[str]: 64 | return self.mask_name_to_instructions.keys() 65 | 66 | def instructions_by_mask_name(self, mask_name: str) -> Collection[AbstractMaskingInstruction]: 67 | return cast(Collection[AbstractMaskingInstruction], self.mask_name_to_instructions.get(mask_name, [])) 68 | 69 | # Some masking examples 70 | # --------------------- 71 | # 72 | # masking_instances = [ 73 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)', "ID"), 74 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)', "IP"), 75 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)', "SEQ"), 76 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)', "SEQ"), 77 | # 78 | # MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)', "HEX"), 79 | # 
MaskingInstruction(r'((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)', "NUM"), 80 | # MaskingInstruction(r'(?<=executed cmd )(".+?")', "CMD"), 81 | # ] 82 | -------------------------------------------------------------------------------- /drain3/template_miner_config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import ast 4 | import configparser 5 | import json 6 | import logging 7 | from typing import Collection, Optional 8 | 9 | from drain3.masking import AbstractMaskingInstruction, MaskingInstruction 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class TemplateMinerConfig: 15 | def __init__(self) -> None: 16 | self.engine = "Drain" 17 | self.profiling_enabled = False 18 | self.profiling_report_sec = 60 19 | self.snapshot_interval_minutes = 5 20 | self.snapshot_compress_state = True 21 | self.drain_extra_delimiters: Collection[str] = [] 22 | self.drain_sim_th = 0.4 23 | self.drain_depth = 4 24 | self.drain_max_children = 100 25 | self.drain_max_clusters: Optional[int] = None 26 | self.masking_instructions: Collection[AbstractMaskingInstruction] = [] 27 | self.mask_prefix = "<" 28 | self.mask_suffix = ">" 29 | self.parameter_extraction_cache_capacity = 3000 30 | self.parametrize_numeric_tokens = True 31 | 32 | def load(self, config_filename: str) -> None: 33 | parser = configparser.ConfigParser() 34 | read_files = parser.read(config_filename) 35 | if len(read_files) == 0: 36 | logger.warning(f"config file not found: {config_filename}") 37 | 38 | section_profiling = 'PROFILING' 39 | section_snapshot = 'SNAPSHOT' 40 | section_drain = 'DRAIN' 41 | section_masking = 'MASKING' 42 | 43 | self.engine = parser.get(section_drain, 'engine', fallback=self.engine) 44 | 45 | self.profiling_enabled = parser.getboolean(section_profiling, 'enabled', 46 | fallback=self.profiling_enabled) 47 | self.profiling_report_sec = parser.getint(section_profiling, 'report_sec', 48 | fallback=self.profiling_report_sec) 49 | 50 | self.snapshot_interval_minutes = parser.getint(section_snapshot, 'snapshot_interval_minutes', 51 | fallback=self.snapshot_interval_minutes) 52 | self.snapshot_compress_state = parser.getboolean(section_snapshot, 'compress_state', 53 | fallback=self.snapshot_compress_state) 54 | 55 | drain_extra_delimiters_str = parser.get(section_drain, 'extra_delimiters', 56 | fallback=str(self.drain_extra_delimiters)) 57 | self.drain_extra_delimiters = ast.literal_eval(drain_extra_delimiters_str) 58 | 59 | self.drain_sim_th = parser.getfloat(section_drain, 'sim_th', 60 | fallback=self.drain_sim_th) 61 | self.drain_depth = parser.getint(section_drain, 'depth', 62 | fallback=self.drain_depth) 63 | self.drain_max_children = parser.getint(section_drain, 'max_children', 64 | fallback=self.drain_max_children) 65 | self.drain_max_clusters = parser.getint(section_drain, 'max_clusters', 66 | fallback=self.drain_max_clusters) 67 | self.parametrize_numeric_tokens = parser.getboolean(section_drain, 'parametrize_numeric_tokens', 68 | fallback=self.parametrize_numeric_tokens) 69 | 70 | masking_instructions_str = parser.get(section_masking, 'masking', 71 | fallback=str(self.masking_instructions)) 72 | self.mask_prefix = parser.get(section_masking, 'mask_prefix', fallback=self.mask_prefix) 73 | self.mask_suffix = parser.get(section_masking, 'mask_suffix', fallback=self.mask_suffix) 74 | self.parameter_extraction_cache_capacity = parser.getint(section_masking, 'parameter_extraction_cache_capacity', 75 | 
fallback=self.parameter_extraction_cache_capacity) 76 | 77 | masking_instructions = [] 78 | masking_list = json.loads(masking_instructions_str) 79 | for mi in masking_list: 80 | instruction = MaskingInstruction(mi['regex_pattern'], mi['mask_with']) 81 | masking_instructions.append(instruction) 82 | self.masking_instructions = masking_instructions 83 | -------------------------------------------------------------------------------- /drain3/simple_profiler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Based on https://github.com/davidohana/SimpleProfiler/blob/main/python/simple_profiler.py 3 | 4 | import os 5 | import time 6 | 7 | from abc import ABC, abstractmethod 8 | from typing import Any, Callable, MutableMapping, Union 9 | 10 | 11 | class Profiler(ABC): 12 | 13 | @abstractmethod 14 | def start_section(self, section_name: str) -> None: 15 | pass 16 | 17 | @abstractmethod 18 | def end_section(self, section_name: str = "") -> None: 19 | pass 20 | 21 | @abstractmethod 22 | def report(self, period_sec: int = 30) -> None: 23 | pass 24 | 25 | 26 | class NullProfiler(Profiler): 27 | """A no-op profiler. Use it instead of SimpleProfiler in case you want to disable profiling.""" 28 | 29 | def start_section(self, section_name: str) -> None: 30 | pass 31 | 32 | def end_section(self, section_name: str = "") -> None: 33 | pass 34 | 35 | def report(self, period_sec: int = 30) -> None: 36 | pass 37 | 38 | 39 | class SimpleProfiler(Profiler): 40 | def __init__(self, 41 | reset_after_sample_count: int = 0, 42 | enclosing_section_name: str = "total", 43 | printer: Callable[[str], Any] = print, 44 | report_sec: int = 30): 45 | self.printer = printer 46 | self.enclosing_section_name = enclosing_section_name 47 | self.reset_after_sample_count = reset_after_sample_count 48 | self.report_sec = report_sec 49 | 50 | self.section_to_stats: MutableMapping[str, ProfiledSectionStats] = {} 51 | self.last_report_timestamp_sec = time.time() 52 | self.last_started_section_name = "" 53 | 54 | def start_section(self, section_name: str) -> None: 55 | """Start measuring a section""" 56 | 57 | if not section_name: 58 | raise ValueError("Section name is empty") 59 | self.last_started_section_name = section_name 60 | 61 | section = self.section_to_stats.get(section_name, None) 62 | if section is None: 63 | section = ProfiledSectionStats(section_name) 64 | self.section_to_stats[section_name] = section 65 | 66 | if section.start_time_sec != 0: 67 | raise ValueError(f"Section {section_name} is already started") 68 | 69 | section.start_time_sec = time.time() 70 | 71 | def end_section(self, name: str = "") -> None: 72 | """End measuring a section. 
Leave section name empty to end the last started section.""" 73 | 74 | now = time.time() 75 | 76 | section_name = name 77 | if not name: 78 | section_name = self.last_started_section_name 79 | 80 | if not section_name: 81 | raise ValueError("Neither section name is specified nor a section is started") 82 | 83 | if section_name not in self.section_to_stats: 84 | raise ValueError(f"Section {section_name} does not exist") 85 | section = self.section_to_stats[section_name] 86 | 87 | if section.start_time_sec == 0: 88 | raise ValueError(f"Section {section_name} was not started") 89 | 90 | took_sec = now - section.start_time_sec 91 | if 0 < self.reset_after_sample_count == section.sample_count: 92 | section.sample_count_batch = 0 93 | section.total_time_sec_batch = 0 94 | 95 | section.sample_count += 1 96 | section.total_time_sec += took_sec 97 | section.sample_count_batch += 1 98 | section.total_time_sec_batch += took_sec 99 | section.start_time_sec = 0 100 | 101 | def report(self, period_sec: int = 30) -> None: 102 | """Print results using [printer] function. By default prints to stdout.""" 103 | if time.time() - self.last_report_timestamp_sec < period_sec: 104 | return 105 | 106 | enclosing_time_sec: Union[int, float] = 0 107 | if self.enclosing_section_name: 108 | if self.enclosing_section_name in self.section_to_stats: 109 | enclosing_time_sec = self.section_to_stats[self.enclosing_section_name].total_time_sec 110 | 111 | include_batch_rates = self.reset_after_sample_count > 0 112 | 113 | sections = self.section_to_stats.values() 114 | sorted_sections = sorted(sections, key=lambda it: it.total_time_sec, reverse=True) 115 | lines = map(lambda it: it.to_string(enclosing_time_sec, include_batch_rates), sorted_sections) 116 | text = os.linesep.join(lines) 117 | self.printer(text) 118 | 119 | self.last_report_timestamp_sec = time.time() 120 | 121 | 122 | class ProfiledSectionStats: 123 | def __init__(self, section_name: str, start_time_sec: Union[int, float] = 0, sample_count: int = 0, 124 | total_time_sec: Union[int, float] = 0, sample_count_batch: int = 0, 125 | total_time_sec_batch: Union[int, float] = 0) -> None: 126 | self.section_name = section_name 127 | self.start_time_sec = start_time_sec 128 | self.sample_count = sample_count 129 | self.total_time_sec = total_time_sec 130 | self.sample_count_batch = sample_count_batch 131 | self.total_time_sec_batch = total_time_sec_batch 132 | 133 | def to_string(self, enclosing_time_sec: Union[int, float], include_batch_rates: bool) -> str: 134 | took_sec_text = f"{self.total_time_sec:>8.2f} s" 135 | if enclosing_time_sec > 0: 136 | took_sec_text += f" ({100 * self.total_time_sec / enclosing_time_sec:>6.2f}%)" 137 | 138 | ms_per_k_samples = f"{1000000 * self.total_time_sec / self.sample_count: 7.2f}" 139 | 140 | if self.total_time_sec > 0: 141 | samples_per_sec = f"{self.sample_count / self.total_time_sec: 15,.2f}" 142 | else: 143 | samples_per_sec = "N/A" 144 | 145 | if include_batch_rates: 146 | ms_per_k_samples += f" ({1000000 * self.total_time_sec_batch / self.sample_count_batch: 7.2f})" 147 | if self.total_time_sec_batch > 0: 148 | samples_per_sec += f" ({self.sample_count_batch / self.total_time_sec_batch: 15,.2f})" 149 | else: 150 | samples_per_sec += " (N/A)" 151 | 152 | return f"{self.section_name: <15}: took {took_sec_text}, " \ 153 | f"{self.sample_count: >10,} samples, " \ 154 | f"{ms_per_k_samples} ms / 1000 samples, " \ 155 | f"{samples_per_sec} hz" 156 | -------------------------------------------------------------------------------- 
/drain3/jaccard_drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # This file implements the Drain algorithm for log parsing. 3 | # Based on https://github.com/logpai/logparser/blob/master/logparser/Drain/Drain.py by LogPAI team 4 | 5 | from typing import Optional, Sequence, Tuple 6 | 7 | from drain3.drain import DrainBase, LogCluster, Node 8 | 9 | 10 | class JaccardDrain(DrainBase): 11 | """ 12 | add a new matching pattern to the log cluster. 13 | Cancels log message length as first token. 14 | Drain that uses Jaccard similarity to match log messages. 15 | """ 16 | 17 | def tree_search(self, 18 | root_node: Node, 19 | tokens: Sequence[str], 20 | sim_th: float, 21 | include_params: bool) -> Optional[LogCluster]: 22 | # at first level, children are grouped by token (The first word in tokens) 23 | token_count = len(tokens) 24 | # cur_node = root_node.key_to_child_node.get(str(token_count)) 25 | 26 | if not tokens: 27 | token_first = "" 28 | cur_node = root_node.key_to_child_node.get(token_first) 29 | else: 30 | token_first = tokens[0] 31 | cur_node = root_node.key_to_child_node.get(token_first) 32 | 33 | # no template with same token count yet 34 | if cur_node is None: 35 | return None 36 | 37 | # handle case of empty log string - return the single cluster in that group 38 | if token_count == 0: 39 | return self.id_to_cluster.get(cur_node.cluster_ids[0]) 40 | 41 | # find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth) 42 | cur_node_depth = 1 # first level is 1 43 | 44 | for token in tokens[1:]: 45 | # at max depth 46 | if cur_node_depth >= self.max_node_depth: 47 | break 48 | 49 | # this is last token 50 | # It starts with the second word, so the sentence length -1 51 | if cur_node_depth == token_count - 1: 52 | break 53 | 54 | key_to_child_node = cur_node.key_to_child_node 55 | cur_node = key_to_child_node.get(token) 56 | 57 | if cur_node is None: # no exact next token exist, try wildcard node 58 | cur_node = key_to_child_node.get(self.param_str) 59 | if cur_node is None: # no wildcard node exist 60 | return None 61 | 62 | cur_node_depth += 1 63 | 64 | # get best match among all clusters with same prefix, or None if no match is above sim_th 65 | cluster = self.fast_match(cur_node.cluster_ids, tokens, sim_th, include_params) 66 | 67 | return cluster 68 | 69 | def add_seq_to_prefix_tree(self, root_node: Node, cluster: LogCluster) -> None: 70 | token_count = len(cluster.log_template_tokens) 71 | # Determine if the string is empty 72 | if not cluster.log_template_tokens: 73 | token_first = "" 74 | else: 75 | token_first = cluster.log_template_tokens[0] 76 | if token_first not in root_node.key_to_child_node: 77 | first_layer_node = Node() 78 | root_node.key_to_child_node[token_first] = first_layer_node 79 | else: 80 | first_layer_node = root_node.key_to_child_node[token_first] 81 | 82 | cur_node = first_layer_node 83 | 84 | # handle case of empty log string 85 | if token_count == 0: 86 | cur_node.cluster_ids = [cluster.cluster_id] 87 | return 88 | 89 | # test_add_shorter_than_depth_message : only one word add into current node 90 | if token_count == 1: 91 | # clean up stale clusters before adding a new one. 
92 | new_cluster_ids = [] 93 | for cluster_id in cur_node.cluster_ids: 94 | if cluster_id in self.id_to_cluster: 95 | new_cluster_ids.append(cluster_id) 96 | new_cluster_ids.append(cluster.cluster_id) 97 | cur_node.cluster_ids = new_cluster_ids 98 | 99 | current_depth = 1 100 | for token in cluster.log_template_tokens[1:]: 101 | # if at max depth or this is last token in template - add current log cluster to the leaf node 102 | # It starts with the second word, so the sentence length -1 103 | if current_depth >= self.max_node_depth or current_depth >= token_count - 1: 104 | # clean up stale clusters before adding a new one. 105 | new_cluster_ids = [] 106 | for cluster_id in cur_node.cluster_ids: 107 | if cluster_id in self.id_to_cluster: 108 | new_cluster_ids.append(cluster_id) 109 | new_cluster_ids.append(cluster.cluster_id) 110 | cur_node.cluster_ids = new_cluster_ids 111 | break 112 | 113 | # if token not matched in this layer of existing tree. 114 | if token not in cur_node.key_to_child_node: 115 | if self.parametrize_numeric_tokens and self.has_numbers(token): 116 | if self.param_str not in cur_node.key_to_child_node: 117 | new_node = Node() 118 | cur_node.key_to_child_node[self.param_str] = new_node 119 | cur_node = new_node 120 | else: 121 | cur_node = cur_node.key_to_child_node[self.param_str] 122 | 123 | else: 124 | if self.param_str in cur_node.key_to_child_node: 125 | if len(cur_node.key_to_child_node) < self.max_children: 126 | new_node = Node() 127 | cur_node.key_to_child_node[token] = new_node 128 | cur_node = new_node 129 | else: 130 | cur_node = cur_node.key_to_child_node[self.param_str] 131 | else: 132 | if len(cur_node.key_to_child_node) + 1 < self.max_children: 133 | new_node = Node() 134 | cur_node.key_to_child_node[token] = new_node 135 | cur_node = new_node 136 | elif len(cur_node.key_to_child_node) + 1 == self.max_children: 137 | new_node = Node() 138 | cur_node.key_to_child_node[self.param_str] = new_node 139 | cur_node = new_node 140 | else: 141 | cur_node = cur_node.key_to_child_node[self.param_str] 142 | 143 | # if the token is matched 144 | else: 145 | cur_node = cur_node.key_to_child_node[token] 146 | 147 | current_depth += 1 148 | 149 | # seq1 is a template, seq2 is the log to match 150 | def get_seq_distance(self, seq1: Sequence[str], seq2: Sequence[str], include_params: bool) -> Tuple[float, int]: 151 | # Jaccard index, It is used to measure the similarity of two sets. 152 | # The closer its value is to 1, the more common members the two sets have, and the higher the similarity. 
153 | 154 | # sequences are empty - full match 155 | if len(seq1) == 0: 156 | return 1.0, 0 157 | 158 | param_count = 0 159 | 160 | for token1 in seq1: 161 | if token1 == self.param_str: 162 | param_count += 1 163 | 164 | # If the token and the data have the same length, and there are param_str in the token 165 | if len(seq1) == len(seq2) and param_count > 0: 166 | # seq2 removes the param_str position 167 | seq2 = [x for i, x in enumerate(seq2) if seq1[i] != self.param_str] 168 | 169 | # If there are param_str, they are removed from the coefficient calculation 170 | if include_params: 171 | seq1 = [x for x in seq1 if x != self.param_str] 172 | 173 | # Calculate the Jaccard coefficient 174 | ret_val = len(set(seq1) & set(seq2)) / len(set(seq1) | set(seq2)) 175 | 176 | # Jaccard coefficient calculated under the same conditions has a low simSep value 177 | # So gain is applied to the calculated value (The test case test_add_log_message_sim_75) 178 | ret_val = ret_val * 1.3 if ret_val * 1.3 < 1 else 1 179 | 180 | return ret_val, param_count 181 | 182 | # seq1:tonkens->list seq2:template->tuple 183 | def create_template(self, seq1: Sequence[str], seq2: Sequence[str]) -> Sequence[str]: 184 | 185 | inter_set = set(seq1) & set(seq2) 186 | 187 | # test_max_clusters_lru_multiple_leaf_nodes 188 | # Update param_str at different positions with the same length 189 | if len(seq1) == len(seq2): 190 | ret_val = list(seq2) 191 | for i, (token1, token2) in enumerate(zip(seq1, seq2)): 192 | if token1 != token2: 193 | ret_val[i] = self.param_str 194 | # param_str is updated at the new position with different length 195 | else: 196 | # Take the template with long length 197 | ret_val = list(seq1) if len(seq1) > len(seq2) else list(seq2) 198 | for i, token in enumerate(ret_val): 199 | if token not in inter_set: 200 | ret_val[i] = self.param_str 201 | 202 | return ret_val 203 | 204 | def match(self, content: str, full_search_strategy: str = "never") -> Optional[LogCluster]: 205 | 206 | assert full_search_strategy in ["always", "never", "fallback"] 207 | 208 | # Because the template length and data are not equal in length, Jaccard distance required_sim_th != 1 209 | required_sim_th = 0.8 210 | content_tokens = self.get_content_as_tokens(content) 211 | 212 | def full_search() -> Optional[LogCluster]: 213 | all_ids = self.get_clusters_ids_for_seq_len(content_tokens[0]) 214 | cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True) 215 | return cluster 216 | 217 | if full_search_strategy == "always": 218 | return full_search() 219 | 220 | match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True) 221 | if match_cluster is not None: 222 | return match_cluster 223 | 224 | if full_search_strategy == "never": 225 | return None 226 | 227 | return full_search() 228 | 229 | -------------------------------------------------------------------------------- /tests/test_drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import unittest 4 | 5 | from drain3.drain import Drain, LogCluster 6 | 7 | 8 | class DrainTest(unittest.TestCase): 9 | 10 | def test_add_shorter_than_depth_message(self): 11 | model = Drain(depth=4) 12 | res = model.add_log_message("hello") 13 | print(res[1]) 14 | print(res[0]) 15 | self.assertEqual(res[1], "cluster_created") 16 | 17 | res = model.add_log_message("hello") 18 | print(res[1]) 19 | print(res[0]) 20 | self.assertEqual(res[1], "none") 21 | 22 
| res = model.add_log_message("otherword") 23 | print(res[1]) 24 | print(res[0]) 25 | self.assertEqual(res[1], "cluster_created") 26 | 27 | self.assertEqual(2, len(model.id_to_cluster)) 28 | 29 | def test_add_log_message(self): 30 | model = Drain() 31 | entries = str.splitlines( 32 | """ 33 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 34 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 35 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 36 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 37 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 38 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 39 | """ 40 | ) 41 | expected = str.splitlines( 42 | """ 43 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 44 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 45 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 46 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 47 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 48 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 49 | """ 50 | ) 51 | actual = [] 52 | 53 | for entry in entries: 54 | cluster, change_type = model.add_log_message(entry) 55 | actual.append(cluster.get_template()) 56 | 57 | self.assertListEqual(list(map(str.strip, expected)), actual) 58 | self.assertEqual(8, model.get_total_cluster_size()) 59 | 60 | def test_add_log_message_sim_75(self): 61 | """When `sim_th` is set to 75% then only certain log entries match. 62 | 63 | In this test similarity threshold is set to 75% which makes the model 64 | less aggressive in grouping entries into clusters. In particular, it 65 | only finds clusters for "Failed password" entries. 
66 | """ 67 | model = Drain( 68 | depth=4, 69 | sim_th=0.75, 70 | max_children=100, 71 | ) 72 | entries = str.splitlines( 73 | """ 74 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 75 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 76 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 77 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 78 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 79 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 80 | """ 81 | ) 82 | expected = str.splitlines( 83 | """ 84 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 85 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 86 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 87 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 88 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 89 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 90 | """ 91 | ) 92 | actual = [] 93 | 94 | for entry in entries: 95 | cluster, change_type = model.add_log_message(entry) 96 | actual.append(cluster.get_template()) 97 | 98 | self.assertListEqual(list(map(str.strip, expected)), actual) 99 | self.assertEqual(8, model.get_total_cluster_size()) 100 | 101 | def test_max_clusters(self): 102 | """Verify model respects the max_clusters option. 103 | 104 | Key difference between this and other tests is that with `max_clusters` 105 | set to 1 model is capable of keeping track of a single cluster at a 106 | time. Consequently, when log stream switched form the A format to the B 107 | and back model doesn't recognize it and returnes a new template with no 108 | slots. 109 | """ 110 | model = Drain(max_clusters=1) 111 | entries = str.splitlines( 112 | """ 113 | A format 1 114 | A format 2 115 | B format 1 116 | B format 2 117 | A format 3 118 | """ 119 | ) 120 | expected = str.splitlines( 121 | """ 122 | A format 1 123 | A format <*> 124 | B format 1 125 | B format <*> 126 | A format 3 127 | """ 128 | ) 129 | actual = [] 130 | 131 | for entry in entries: 132 | cluster, change_type = model.add_log_message(entry) 133 | actual.append(cluster.get_template()) 134 | 135 | self.assertListEqual(list(map(str.strip, expected)), actual) 136 | self.assertEqual(1, model.get_total_cluster_size()) 137 | 138 | def test_max_clusters_lru_multiple_leaf_nodes(self): 139 | """When all templates end up in different nodes and the max number of 140 | clusters is reached, then clusters are removed according to the lru 141 | policy. 
142 | """ 143 | model = Drain(max_clusters=2, depth=4, param_str="*") # sim_th=0.75 144 | entries = [ 145 | "A A A", 146 | "A A B", 147 | "B A A", 148 | "B A B", 149 | "C A A", 150 | "C A B", 151 | "B A A", 152 | "A A A", 153 | ] 154 | expected = [ 155 | # lru: [] 156 | "A A A", 157 | # lru: ["A A A"] 158 | "A A *", 159 | # lru: ["A A *"] 160 | "B A A", 161 | # lru: ["B A A", "A A *"] 162 | "B A *", 163 | # lru: ["B A *", "A A *"] 164 | "C A A", 165 | # lru: ["C A A", "B A *"] 166 | "C A *", 167 | # lru: ["C A *", "B A *"] 168 | "B A *", 169 | # Message "B A A" was normalized because the template "B A *" is 170 | # still present in the cache. 171 | # lru: ["B A *", "C A *"] 172 | "A A A", 173 | # Message "A A A" was not normalized because the template "C A A" 174 | # pushed out the template "A A *" from the cache. 175 | # lru: ["A A A", "C A *"] 176 | ] 177 | actual = [] 178 | 179 | for entry in entries: 180 | cluster, _ = model.add_log_message(entry) 181 | actual.append(cluster.get_template()) 182 | 183 | self.assertListEqual(list(map(str.strip, expected)), actual) 184 | self.assertEqual(4, model.get_total_cluster_size()) 185 | 186 | def test_max_clusters_lru_single_leaf_node(self): 187 | """When all templates end up in the same leaf node and the max number of 188 | clusters is reached, then clusters are removed according to the lru 189 | policy. 190 | """ 191 | model = Drain(max_clusters=2, depth=4, param_str="*") 192 | entries = [ 193 | "A A A", 194 | "A A B", 195 | "A B A", 196 | "A B B", 197 | "A C A", 198 | "A C B", 199 | "A B A", 200 | "A A A", 201 | ] 202 | expected = [ 203 | # lru: [] 204 | "A A A", 205 | # lru: ["A A A"] 206 | "A A *", 207 | # lru: ["A A *"] 208 | "A B A", 209 | # lru: ["B A A", "A A *"] 210 | "A B *", 211 | # lru: ["B A *", "A A *"] 212 | "A C A", 213 | # lru: ["C A A", "B A *"] 214 | "A C *", 215 | # lru: ["C A *", "B A *"] 216 | "A B *", 217 | # Message "B A A" was normalized because the template "B A *" is 218 | # still present in the cache. 219 | # lru: ["B A *", "C A *"] 220 | "A A A", 221 | # Message "A A A" was not normalized because the template "C A A" 222 | # pushed out the template "A A *" from the cache. 
223 | # lru: ["A A A", "C A *"] 224 | ] 225 | actual = [] 226 | 227 | for entry in entries: 228 | cluster, _ = model.add_log_message(entry) 229 | actual.append(cluster.get_template()) 230 | 231 | self.assertListEqual(list(map(str.strip, expected)), actual) 232 | # self.assertEqual(5, model.get_total_cluster_size()) 233 | 234 | def test_match_only(self): 235 | model = Drain() 236 | res = model.add_log_message("aa aa aa") 237 | print(res[0]) 238 | 239 | res = model.add_log_message("aa aa bb") 240 | print(res[0]) 241 | 242 | res = model.add_log_message("aa aa cc") 243 | print(res[0]) 244 | 245 | res = model.add_log_message("xx yy zz") 246 | print(res[0]) 247 | 248 | c: LogCluster = model.match("aa aa tt") 249 | self.assertEqual(1, c.cluster_id) 250 | 251 | c: LogCluster = model.match("xx yy zz") 252 | self.assertEqual(2, c.cluster_id) 253 | 254 | c: LogCluster = model.match("xx yy rr") 255 | self.assertIsNone(c) 256 | 257 | c: LogCluster = model.match("nothing") 258 | self.assertIsNone(c) 259 | 260 | def test_create_template(self): 261 | model = Drain(param_str="*") 262 | 263 | seq1 = ["aa", "bb", "dd"] 264 | seq2 = ["aa", "bb", "cc"] 265 | 266 | # test for proper functionality 267 | template = model.create_template(seq1, seq2) 268 | self.assertListEqual(["aa", "bb", "*"], template) 269 | 270 | template = model.create_template(seq1, seq1) 271 | self.assertListEqual(seq1, template) 272 | 273 | # Test for equal lengths input vectors 274 | self.assertRaises(AssertionError, model.create_template, seq1, ["aa"]) -------------------------------------------------------------------------------- /tests/test_jaccard_drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import unittest 4 | 5 | from drain3.drain import LogCluster 6 | from drain3.jaccard_drain import JaccardDrain 7 | 8 | 9 | class DrainTest(unittest.TestCase): 10 | 11 | def test_add_shorter_than_depth_message(self): 12 | model = JaccardDrain(depth=4) 13 | res = model.add_log_message("hello") 14 | print(res[1]) 15 | print(res[0]) 16 | self.assertEqual(res[1], "cluster_created") 17 | 18 | res = model.add_log_message("hello") 19 | print(res[1]) 20 | print(res[0]) 21 | self.assertEqual(res[1], "none") 22 | 23 | res = model.add_log_message("otherword") 24 | print(res[1]) 25 | print(res[0]) 26 | self.assertEqual(res[1], "cluster_created") 27 | 28 | self.assertEqual(2, len(model.id_to_cluster)) 29 | 30 | def test_add_log_message(self): 31 | model = JaccardDrain() 32 | entries = str.splitlines( 33 | """ 34 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 35 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 36 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 37 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 38 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 39 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 40 | """ 41 | ) 42 | expected = str.splitlines( 43 | """ 44 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 45 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 46 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 47 | Dec 10 <*> LabSZ <*> Failed 
password for invalid user <*> from 0.0.0.0 port <*> ssh2 48 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 49 | Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth] 50 | """ 51 | ) 52 | actual = [] 53 | 54 | for entry in entries: 55 | cluster, change_type = model.add_log_message(entry) 56 | actual.append(cluster.get_template()) 57 | 58 | self.assertListEqual(list(map(str.strip, expected)), actual) 59 | self.assertEqual(8, model.get_total_cluster_size()) 60 | 61 | def test_add_log_message_sim_75(self): 62 | """When `sim_th` is set to 75% then only certain log entries match. 63 | 64 | In this test similarity threshold is set to 75% which makes the model 65 | less aggressive in grouping entries into clusters. In particular, it 66 | only finds clusters for "Failed password" entries. 67 | """ 68 | model = JaccardDrain( 69 | depth=4, 70 | sim_th=0.75, 71 | max_children=100, 72 | ) 73 | entries = str.splitlines( 74 | """ 75 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 76 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 77 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 78 | Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2 79 | Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2 80 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 81 | """ 82 | ) 83 | expected = str.splitlines( 84 | """ 85 | Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth] 86 | Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth] 87 | Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2 88 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 89 | Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2 90 | Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth] 91 | """ 92 | ) 93 | actual = [] 94 | 95 | for entry in entries: 96 | cluster, change_type = model.add_log_message(entry) 97 | actual.append(cluster.get_template()) 98 | 99 | self.assertListEqual(list(map(str.strip, expected)), actual) 100 | self.assertEqual(8, model.get_total_cluster_size()) 101 | 102 | def test_max_clusters(self): 103 | """Verify model respects the max_clusters option. 104 | 105 | Key difference between this and other tests is that with `max_clusters` 106 | set to 1 model is capable of keeping track of a single cluster at a 107 | time. Consequently, when log stream switched form the A format to the B 108 | and back model doesn't recognize it and returnes a new template with no 109 | slots. 
110 | """ 111 | model = JaccardDrain(max_clusters=1) 112 | entries = str.splitlines( 113 | """ 114 | A format 1 115 | A format 2 116 | B format 1 117 | B format 2 118 | A format 3 119 | """ 120 | ) 121 | expected = str.splitlines( 122 | """ 123 | A format 1 124 | A format <*> 125 | B format 1 126 | B format <*> 127 | A format 3 128 | """ 129 | ) 130 | actual = [] 131 | 132 | for entry in entries: 133 | cluster, change_type = model.add_log_message(entry) 134 | actual.append(cluster.get_template()) 135 | 136 | self.assertListEqual(list(map(str.strip, expected)), actual) 137 | self.assertEqual(1, model.get_total_cluster_size()) 138 | 139 | def test_max_clusters_lru_multiple_leaf_nodes(self): 140 | """When all templates end up in different nodes and the max number of 141 | clusters is reached, then clusters are removed according to the lru 142 | policy. 143 | """ 144 | model = JaccardDrain(max_clusters=2, depth=4, param_str="*") # sim_th=0.75 145 | entries = [ 146 | "A A A", 147 | "A A B", 148 | "B A A", 149 | "B A B", 150 | "C A A", 151 | "C A B", 152 | "B A A", 153 | "A A A", 154 | ] 155 | expected = [ 156 | # lru: [] 157 | "A A A", 158 | # lru: ["A A A"] 159 | "A A *", 160 | # lru: ["A A *"] 161 | "B A A", 162 | # lru: ["B A A", "A A *"] 163 | "B A *", 164 | # lru: ["B A *", "A A *"] 165 | "C A A", 166 | # lru: ["C A A", "B A *"] 167 | "C A *", 168 | # lru: ["C A *", "B A *"] 169 | "B A *", 170 | # Message "B A A" was normalized because the template "B A *" is 171 | # still present in the cache. 172 | # lru: ["B A *", "C A *"] 173 | "A A A", 174 | # Message "A A A" was not normalized because the template "C A A" 175 | # pushed out the template "A A *" from the cache. 176 | # lru: ["A A A", "C A *"] 177 | ] 178 | actual = [] 179 | 180 | for entry in entries: 181 | cluster, _ = model.add_log_message(entry) 182 | actual.append(cluster.get_template()) 183 | print(cluster.get_template()) 184 | 185 | self.assertListEqual(list(map(str.strip, expected)), actual) 186 | self.assertEqual(4, model.get_total_cluster_size()) 187 | 188 | def test_max_clusters_lru_single_leaf_node(self): 189 | """When all templates end up in the same leaf node and the max number of 190 | clusters is reached, then clusters are removed according to the lru 191 | policy. 192 | """ 193 | model = JaccardDrain(max_clusters=2, depth=4, param_str="*") 194 | entries = [ 195 | "A A A", 196 | "A A B", 197 | "A B A", 198 | "A B B", 199 | "A C A", 200 | "A C B", 201 | "A B A", 202 | "A A A", 203 | ] 204 | expected = [ 205 | # lru: [] 206 | "A A A", 207 | # lru: ["A A A"] 208 | "A A *", 209 | # lru: ["A A *"] 210 | "A B A", 211 | # lru: ["B A A", "A A *"] 212 | "A B *", 213 | # lru: ["B A *", "A A *"] 214 | "A C A", 215 | # lru: ["C A A", "B A *"] 216 | "A C *", 217 | # lru: ["C A *", "B A *"] 218 | "A B *", 219 | # Message "B A A" was normalized because the template "B A *" is 220 | # still present in the cache. 221 | # lru: ["B A *", "C A *"] 222 | "A A A", 223 | # Message "A A A" was not normalized because the template "C A A" 224 | # pushed out the template "A A *" from the cache. 
225 | # lru: ["A A A", "C A *"] 226 | ] 227 | actual = [] 228 | 229 | for entry in entries: 230 | cluster, _ = model.add_log_message(entry) 231 | actual.append(cluster.get_template()) 232 | 233 | self.assertListEqual(list(map(str.strip, expected)), actual) 234 | # self.assertEqual(5, model.get_total_cluster_size()) 235 | 236 | def test_match_only(self): 237 | model = JaccardDrain() 238 | res = model.add_log_message("aa aa aa") 239 | print(res[0]) 240 | 241 | res = model.add_log_message("aa aa bb") 242 | print(res[0]) 243 | 244 | res = model.add_log_message("aa aa cc") 245 | print(res[0]) 246 | 247 | res = model.add_log_message("xx yy zz") 248 | print(res[0]) 249 | 250 | c: LogCluster = model.match("aa aa tt") 251 | self.assertEqual(1, c.cluster_id) 252 | 253 | c: LogCluster = model.match("xx yy zz") 254 | self.assertEqual(2, c.cluster_id) 255 | 256 | c: LogCluster = model.match("xx yy rr") 257 | self.assertIsNone(c) 258 | 259 | c: LogCluster = model.match("nothing") 260 | self.assertIsNone(c) 261 | 262 | def test_match_token_with_different_length(self): 263 | model = JaccardDrain() 264 | res = model.add_log_message("check pass; user unknown") 265 | print(res[0]) 266 | 267 | res = model.add_log_message("check pass; user Lisa") 268 | print(res[0]) 269 | 270 | res = model.add_log_message("check pass; user li Sa") 271 | print(res[0]) 272 | 273 | res = model.add_log_message("session opened for user cyrus by (uid=0)") 274 | print(res[0]) 275 | 276 | res = model.add_log_message("session closed for user cyrus") 277 | print(res[0]) 278 | 279 | c: LogCluster = model.match("check pass; user boris") 280 | self.assertEqual(1, c.cluster_id) 281 | 282 | c: LogCluster = model.match("session opened for user cyrus by (uid=1)") 283 | self.assertEqual(2, c.cluster_id) 284 | 285 | c: LogCluster = model.match("nothing") 286 | self.assertIsNone(c) 287 | 288 | 289 | if __name__ == "__main__": 290 | pass 291 | -------------------------------------------------------------------------------- /tests/test_template_miner.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import io 4 | import logging 5 | import sys 6 | import unittest 7 | from os.path import dirname 8 | 9 | from drain3 import TemplateMiner 10 | from drain3.masking import MaskingInstruction 11 | from drain3.memory_buffer_persistence import MemoryBufferPersistence 12 | from drain3.template_miner_config import TemplateMinerConfig 13 | 14 | 15 | class TemplateMinerTest(unittest.TestCase): 16 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') 17 | 18 | def test_load_config(self): 19 | config = TemplateMinerConfig() 20 | config.load(f"{dirname(__file__)}/drain3_test.ini") 21 | self.assertEqual(1024, config.drain_max_clusters) 22 | self.assertListEqual(["_"], config.drain_extra_delimiters) 23 | self.assertEqual(7, len(config.masking_instructions)) 24 | 25 | def test_save_load_snapshot_unlimited_clusters(self): 26 | self.save_load_snapshot(None) 27 | 28 | def test_save_load_snapshot_limited_clusters(self): 29 | self.save_load_snapshot(10) 30 | 31 | def save_load_snapshot(self, max_clusters): 32 | persistence = MemoryBufferPersistence() 33 | 34 | config = TemplateMinerConfig() 35 | config.drain_max_clusters = max_clusters 36 | template_miner1 = TemplateMiner(persistence, config) 37 | print(template_miner1.add_log_message("hello")) 38 | print(template_miner1.add_log_message("hello ABC")) 39 | print(template_miner1.add_log_message("hello BCD")) 40 | 
print(template_miner1.add_log_message("hello XYZ")) 41 | print(template_miner1.add_log_message("goodbye XYZ")) 42 | 43 | template_miner2 = TemplateMiner(persistence, config) 44 | 45 | self.assertListEqual(list(template_miner1.drain.id_to_cluster.keys()), 46 | list(template_miner2.drain.id_to_cluster.keys())) 47 | 48 | self.assertListEqual(list(template_miner1.drain.root_node.key_to_child_node.keys()), 49 | list(template_miner2.drain.root_node.key_to_child_node.keys())) 50 | 51 | def get_tree_lines(template_miner): 52 | sio = io.StringIO() 53 | template_miner.drain.print_tree(sio) 54 | sio.seek(0) 55 | return sio.readlines() 56 | 57 | self.assertListEqual(get_tree_lines(template_miner1), 58 | get_tree_lines(template_miner2)) 59 | 60 | print(template_miner2.add_log_message("hello yyy")) 61 | print(template_miner2.add_log_message("goodbye ABC")) 62 | 63 | def test_extract_parameters(self): 64 | config = TemplateMinerConfig() 65 | mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM") 66 | config.masking_instructions.append(mi) 67 | mi = MaskingInstruction(r"multiple words", "WORDS") 68 | config.masking_instructions.append(mi) 69 | config.mask_prefix = "[:" 70 | config.mask_suffix = ":]" 71 | template_miner = TemplateMiner(None, config) 72 | 73 | def add_and_test(msg, expected_params, exact_matching=False): 74 | print(f"msg: {msg}") 75 | res = template_miner.add_log_message(msg) 76 | print(f"result: {res}") 77 | extracted_parameters = template_miner.extract_parameters( 78 | res["template_mined"], msg, exact_matching=exact_matching) 79 | self.assertIsNotNone(extracted_parameters) 80 | params = [parameter.value for parameter in extracted_parameters] 81 | print(f"params: {params}") 82 | self.assertListEqual(params, expected_params) 83 | 84 | add_and_test("hello", []) 85 | add_and_test("hello ABC", []) 86 | add_and_test("hello BCD", ["BCD"]) 87 | add_and_test("hello BCD", ["BCD"]) 88 | add_and_test("hello\tBCD", ["BCD"]) 89 | add_and_test("request took 123 ms", ["123"]) 90 | add_and_test("file saved [test.xml]", []) 91 | add_and_test("new order received: [:xyz:]", []) 92 | add_and_test("order type: new, order priority:3", ["3"]) 93 | add_and_test("order type: changed, order priority:5", ["changed,", "5"]) 94 | add_and_test("sometimes one needs multiple words", ["multiple words"], True) 95 | add_and_test("sometimes one needs not", ["not"], True) 96 | add_and_test("sometimes one needs multiple words", ["multiple words"], True) 97 | 98 | def test_extract_parameters_direct(self): 99 | config = TemplateMinerConfig() 100 | mi = MaskingInstruction(r"hdfs://[\w.:@-]*((/[\w.~%+-]+)+/?)?", "hdfs_uri") 101 | config.masking_instructions.append(mi) 102 | mi = MaskingInstruction(r"(?P[\"'`]).*?(?P=quote)", "quoted_string") 103 | config.masking_instructions.append(mi) 104 | mi = MaskingInstruction(r"((?P[*_])\2{0,2}).*?\1", "markdown_emph") 105 | config.masking_instructions.append(mi) 106 | mi = MaskingInstruction(r"multiple \*word\* pattern", "*words*") 107 | config.masking_instructions.append(mi) 108 | mi = MaskingInstruction(r"some \S+ \S+ pattern", "*words*") 109 | config.masking_instructions.append(mi) 110 | mi = MaskingInstruction(r"(\d{1,3}\.){3}\d{1,3}", "ip") 111 | config.masking_instructions.append(mi) 112 | mi = MaskingInstruction(r"(?P\d+)\.\d+", "float") 113 | config.masking_instructions.append(mi) 114 | mi = MaskingInstruction(r"0[xX][a-fA-F0-9]+", "integer") 115 | config.masking_instructions.append(mi) 116 | mi = MaskingInstruction(r"(?P\d+)", "integer") 117 | 
config.masking_instructions.append(mi) 118 | mi = MaskingInstruction(r"HelloWorld", "*") 119 | config.masking_instructions.append(mi) 120 | mi = MaskingInstruction(r"MaskPrefix", "<") 121 | config.masking_instructions.append(mi) 122 | template_miner = TemplateMiner(None, config) 123 | 124 | test_vectors = [ 125 | ( 126 | ":+", 127 | "hdfs://msra-sa-41:9000/pageinput2.txt:671088640+134217728", 128 | ["hdfs://msra-sa-41:9000/pageinput2.txt", "671088640", "134217728"], 129 | ["hdfs_uri", "integer", "integer"] 130 | ), 131 | ( 132 | "Hello ", 133 | "Hello 'World'", 134 | ["'World'"], 135 | ["quoted_string"] 136 | ), 137 | ( 138 | "", 139 | """'This "should"'`do no breakin'`""", 140 | ["""'This "should"'""", "`do no breakin'`"], 141 | ["quoted_string", "quoted_string"] 142 | ), 143 | ( 144 | "This is !.", 145 | "This is ___very___ *important*!.", 146 | ["___very___", "*important*"], 147 | ["markdown_emph", "markdown_emph"] 148 | ), 149 | ( 150 | ".<*>", 151 | "0.15.Test", 152 | ["0.15", "Test"], 153 | ["float", "*"] 154 | ), 155 | ( 156 | ":", 157 | "192.0.0.1:5000", 158 | ["192.0.0.1", "5000"], 159 | ["ip", "integer"] 160 | ), 161 | ( 162 | "::", 163 | "192.0.0.1:5000:123", 164 | ["192.0.0.1", "5000", "123"], 165 | ["ip", "integer", "integer"] 166 | ), 167 | ( 168 | ".<*>.", 169 | "0.15.Test.0.2", 170 | ["0.15", "Test", "0.2"], 171 | ["float", "*", "float"] 172 | ), 173 | ( 174 | " ", 175 | "0.15 10.16", 176 | ["0.15", "10.16"], 177 | ["float", "float"] 178 | ), 179 | ( 180 | "<*words*>@", 181 | "some other cool pattern@0xe1f", 182 | ["some other cool pattern", "0xe1f"], 183 | ["*words*", "integer"] 184 | ), 185 | ( 186 | "Another test with <*words*> that includes and <*> ", 187 | "Another test with some other 0Xadded pattern that includes 500xc0ffee and 0X4 times 5", 188 | ["some other 0Xadded pattern", "50", "0xc0ffee", "0X4", "times", "5"], 189 | ["*words*", "integer", "integer", "integer", "*", "integer"] 190 | ), 191 | ( 192 | "some <*words*> <*words*>", 193 | "some multiple *word* pattern some confusing *word* pattern", 194 | ["multiple *word* pattern", "some confusing *word* pattern"], 195 | ["*words*", "*words*"] 196 | ), 197 | ( 198 | "<*words*> <*>", 199 | "multiple *word* pattern <*words*>", 200 | ["multiple *word* pattern", "<*words*>"], 201 | ["*words*", "*"] 202 | ), 203 | ( 204 | "<*> <*>", 205 | "HelloWorld Test", 206 | ["HelloWorld", "Test"], 207 | ["*", "*"] 208 | ), 209 | ( 210 | "<*> <*>", 211 | "HelloWorld ", 212 | ["HelloWorld", ""], 213 | ["*", "*"] 214 | ), 215 | ( 216 | "<*>", 217 | "HelloWorld1", 218 | ["HelloWorld", "1"], 219 | ["*", "integer"] 220 | ), 221 | ( 222 | "<*> works <*>", 223 | "This works as-expected", 224 | ["This", "as-expected"], 225 | ["*", "*"] 226 | ), 227 | ( 228 | ">", 229 | "", 230 | ["8"], 231 | ["integer"] 232 | ), 233 | ( 234 | " >>", 235 | ">", 236 | ["8", "0.5"], 237 | ["integer", "float"] 238 | ), 239 | ( 240 | "<*> >>", 241 | "New: >", 242 | ["New:", "8", "0.5"], 243 | ["*", "integer", "float"] 244 | ), 245 | ( 246 | "<<>", 247 | "MaskPrefix", 248 | ["MaskPrefix"], 249 | ["<"] 250 | ), 251 | ( 252 | "<<<>>", 253 | "", 254 | ["MaskPrefix"], 255 | ["<"] 256 | ), 257 | ( 258 | "There are no parameters here.", 259 | "There are no parameters here.", 260 | [], 261 | [] 262 | ), 263 | ( 264 | " ", 265 | "0.15 10.16 3.19", 266 | None, 267 | None 268 | ), 269 | ( 270 | " ", 271 | "0.15 10.16 test 3.19", 272 | None, 273 | None 274 | ), 275 | ( 276 | " >>", 277 | ">", 278 | None, 279 | None 280 | ), 281 | ( 282 | "<<>", 283 | "<<>", 284 | None, 285 
| None 286 | ), 287 | ( 288 | "<*words*> <*words*>", 289 | "0.15 0.15", 290 | None, 291 | None 292 | ), 293 | ] 294 | 295 | for template, content, expected_parameters, expected_mask_names in test_vectors: 296 | with self.subTest(template=template, content=content, expected_parameters=expected_parameters): 297 | extracted_parameters = template_miner.extract_parameters(template, content, exact_matching=True) 298 | if expected_parameters is None: 299 | self.assertIsNone(extracted_parameters) 300 | else: 301 | self.assertIsNotNone(extracted_parameters) 302 | self.assertListEqual([parameter.value for parameter in extracted_parameters], 303 | expected_parameters) 304 | self.assertListEqual([parameter.mask_name for parameter in extracted_parameters], 305 | expected_mask_names) 306 | 307 | def test_match_only(self): 308 | config = TemplateMinerConfig() 309 | config.drain_extra_delimiters = ["_"] 310 | mi = MaskingInstruction("((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM") 311 | config.masking_instructions.append(mi) 312 | tm = TemplateMiner(None, config) 313 | 314 | res = tm.add_log_message("aa aa aa") 315 | print(res) 316 | 317 | res = tm.add_log_message("aa aa bb") 318 | print(res) 319 | 320 | res = tm.add_log_message("xx yy zz") 321 | print(res) 322 | 323 | res = tm.add_log_message("rrr qqq 123") 324 | print(res) 325 | 326 | c = tm.match("aa aa tt") 327 | self.assertEqual(1, c.cluster_id) 328 | 329 | c = tm.match("aa aa 12") 330 | self.assertEqual(1, c.cluster_id) 331 | 332 | c = tm.match("xx yy zz") 333 | self.assertEqual(2, c.cluster_id) 334 | 335 | c = tm.match("xx yy rr") 336 | self.assertIsNone(c) 337 | 338 | c = tm.match("nothing") 339 | self.assertIsNone(c) 340 | 341 | c = tm.match("rrr qqq 456 ") 342 | self.assertEqual(3, c.cluster_id) 343 | 344 | c = tm.match("rrr qqq 555.2") 345 | self.assertIsNone(c) 346 | 347 | c = tm.match("rrr qqq num") 348 | self.assertIsNone(c) 349 | 350 | def test_match_strategies(self): 351 | miner = TemplateMiner() 352 | print(miner.add_log_message("training4Model start")) 353 | print(miner.add_log_message("loadModel start")) 354 | print(miner.add_log_message("loadModel stop")) 355 | print(miner.add_log_message("this is a test")) 356 | miner.drain.print_tree() 357 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 358 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 359 | self.assertIsNone(miner.match("loadModel start", full_search_strategy="never")) 360 | print(miner.add_log_message("loadModel start")) 361 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 362 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 363 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never")) 364 | 365 | config = TemplateMinerConfig() 366 | config.parametrize_numeric_tokens = False 367 | miner = TemplateMiner(config=config) 368 | print(miner.add_log_message("training4Model start")) 369 | print(miner.add_log_message("loadModel start")) 370 | print(miner.add_log_message("loadModel stop")) 371 | print(miner.add_log_message("this is a test")) 372 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="fallback")) 373 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="always")) 374 | self.assertIsNotNone(miner.match("loadModel start", full_search_strategy="never")) 375 | 376 | self.assertIsNone(miner.match("", full_search_strategy="never")) 377 | 
self.assertIsNone(miner.match("", full_search_strategy="always")) 378 | self.assertIsNone(miner.match("", full_search_strategy="fallback")) 379 | 380 | print(miner.add_log_message("")) 381 | self.assertIsNotNone(miner.match("", full_search_strategy="never")) 382 | self.assertIsNotNone(miner.match("", full_search_strategy="always")) 383 | self.assertIsNotNone(miner.match("", full_search_strategy="fallback")) 384 | -------------------------------------------------------------------------------- /drain3/template_miner.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | 3 | import base64 4 | import logging 5 | import re 6 | import time 7 | import zlib 8 | from typing import Optional, Mapping, MutableMapping, NamedTuple, Sequence, Tuple, Union 9 | 10 | import jsonpickle # type: ignore[import] 11 | from cachetools import LRUCache, cachedmethod 12 | 13 | from drain3.drain import Drain, DrainBase, LogCluster 14 | from drain3.masking import LogMasker 15 | from drain3.persistence_handler import PersistenceHandler 16 | from drain3.simple_profiler import SimpleProfiler, NullProfiler, Profiler 17 | from drain3.template_miner_config import TemplateMinerConfig 18 | 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | config_filename = 'drain3.ini' 23 | 24 | ExtractedParameter = NamedTuple("ExtractedParameter", [("value", str), ("mask_name", str)]) 25 | 26 | 27 | class TemplateMiner: 28 | 29 | def __init__(self, 30 | persistence_handler: Optional[PersistenceHandler] = None, 31 | config: Optional[TemplateMinerConfig] = None): 32 | """ 33 | Wrapper for Drain with persistence and masking support 34 | :param persistence_handler: The type of persistence to use. When None, no persistence is applied. 35 | :param config: Configuration object. When none, configuration is loaded from default .ini file (if exist) 36 | """ 37 | logger.info("Starting Drain3 template miner") 38 | 39 | if config is None: 40 | logger.info(f"Loading configuration from {config_filename}") 41 | config = TemplateMinerConfig() 42 | config.load(config_filename) 43 | 44 | self.config = config 45 | 46 | self.profiler: Profiler = NullProfiler() 47 | 48 | if self.config.profiling_enabled: 49 | self.profiler = SimpleProfiler() 50 | 51 | self.persistence_handler = persistence_handler 52 | 53 | param_str = f"{self.config.mask_prefix}*{self.config.mask_suffix}" 54 | 55 | # Follow the configuration in the configuration file to instantiate Drain 56 | # target_obj will be "Drain" if the engine argument is not specified. 
57 | target_obj = self.config.engine 58 | if target_obj not in ["Drain", "JaccardDrain"]: 59 | raise ValueError(f"Invalid matched_pattern: {target_obj}, must be either 'Drain' or 'JaccardDrain'") 60 | 61 | self.drain: DrainBase = globals()[target_obj]( 62 | sim_th=self.config.drain_sim_th, 63 | depth=self.config.drain_depth, 64 | max_children=self.config.drain_max_children, 65 | max_clusters=self.config.drain_max_clusters, 66 | extra_delimiters=self.config.drain_extra_delimiters, 67 | profiler=self.profiler, 68 | param_str=param_str, 69 | parametrize_numeric_tokens=self.config.parametrize_numeric_tokens 70 | ) 71 | 72 | self.masker = LogMasker(self.config.masking_instructions, self.config.mask_prefix, self.config.mask_suffix) 73 | self.parameter_extraction_cache: MutableMapping[Tuple[str, bool], str] = \ 74 | LRUCache(self.config.parameter_extraction_cache_capacity) 75 | self.last_save_time = time.time() 76 | 77 | if persistence_handler is not None: 78 | self.load_state() 79 | 80 | def load_state(self) -> None: 81 | logger.info("Checking for saved state") 82 | 83 | assert self.persistence_handler is not None 84 | 85 | state = self.persistence_handler.load_state() 86 | if state is None: 87 | logger.info("Saved state not found") 88 | return 89 | 90 | if self.config.snapshot_compress_state: 91 | state = zlib.decompress(base64.b64decode(state)) 92 | 93 | loaded_drain: Drain = jsonpickle.loads(state, keys=True) 94 | 95 | # json-pickle encoded keys as string by default, so we have to convert those back to int 96 | # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1 97 | # which did not use json-pickle's keys=true 98 | if len(loaded_drain.id_to_cluster) > 0 and isinstance(next(iter(loaded_drain.id_to_cluster.keys())), str): 99 | loaded_drain.id_to_cluster = {int(k): v for k, v in list(loaded_drain.id_to_cluster.items())} 100 | if self.config.drain_max_clusters: 101 | cache: MutableMapping[int, Optional[LogCluster]] = LRUCache(maxsize=self.config.drain_max_clusters) 102 | cache.update(loaded_drain.id_to_cluster) 103 | loaded_drain.id_to_cluster = cache 104 | 105 | self.drain.id_to_cluster = loaded_drain.id_to_cluster 106 | self.drain.clusters_counter = loaded_drain.clusters_counter 107 | self.drain.root_node = loaded_drain.root_node 108 | 109 | logger.info(f"Restored {len(loaded_drain.clusters)} clusters " 110 | f"built from {loaded_drain.get_total_cluster_size()} messages") 111 | 112 | def save_state(self, snapshot_reason: str) -> None: 113 | assert self.persistence_handler is not None 114 | 115 | state = jsonpickle.dumps(self.drain, keys=True).encode('utf-8') 116 | if self.config.snapshot_compress_state: 117 | state = base64.b64encode(zlib.compress(state)) 118 | 119 | logger.info(f"Saving state of {len(self.drain.clusters)} clusters " 120 | f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, " 121 | f"reason: {snapshot_reason}") 122 | self.persistence_handler.save_state(state) 123 | 124 | def get_snapshot_reason(self, change_type: str, cluster_id: int) -> Optional[str]: 125 | if change_type != "none": 126 | return f"{change_type} ({cluster_id})" 127 | 128 | diff_time_sec = time.time() - self.last_save_time 129 | if diff_time_sec >= self.config.snapshot_interval_minutes * 60: 130 | return "periodic" 131 | 132 | return None 133 | 134 | def add_log_message(self, log_message: str) -> Mapping[str, Union[str, int]]: 135 | self.profiler.start_section("total") 136 | 137 | self.profiler.start_section("mask") 138 | masked_content = 
self.masker.mask(log_message) 139 | self.profiler.end_section() 140 | 141 | self.profiler.start_section("drain") 142 | cluster, change_type = self.drain.add_log_message(masked_content) 143 | self.profiler.end_section("drain") 144 | result: Mapping[str, Union[str, int]] = { 145 | "change_type": change_type, 146 | "cluster_id": cluster.cluster_id, 147 | "cluster_size": cluster.size, 148 | "template_mined": cluster.get_template(), 149 | "cluster_count": len(self.drain.clusters) 150 | } 151 | 152 | if self.persistence_handler is not None: 153 | self.profiler.start_section("save_state") 154 | snapshot_reason = self.get_snapshot_reason(change_type, cluster.cluster_id) 155 | if snapshot_reason: 156 | self.save_state(snapshot_reason) 157 | self.last_save_time = time.time() 158 | self.profiler.end_section() 159 | 160 | self.profiler.end_section("total") 161 | self.profiler.report(self.config.profiling_report_sec) 162 | return result 163 | 164 | def match(self, log_message: str, full_search_strategy: str = "never") -> Optional[LogCluster]: 165 | """ 166 | Mask log message and match against an already existing cluster. 167 | Match shall be perfect (sim_th=1.0). 168 | New cluster will not be created as a result of this call, nor any cluster modifications. 169 | 170 | :param log_message: log message to match 171 | :param full_search_strategy: when to perform full cluster search. 172 | (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce 173 | false negatives (wrong mismatches) on some edge cases; 174 | (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in 175 | case tree search found no match. 176 | It should not have false negatives, however tree-search may find a non-optimal match with 177 | more wildcard parameters than necessary; 178 | (3) "always" is the slowest. It will select the best match among all known clusters, by always evaluating 179 | all clusters with the same token count, and selecting the cluster with perfect all token match and least 180 | count of wildcard matches. 181 | :return: Matched cluster or None if no match found. 182 | """ 183 | 184 | masked_content = self.masker.mask(log_message) 185 | matched_cluster = self.drain.match(masked_content, full_search_strategy) 186 | return matched_cluster 187 | 188 | def get_parameter_list(self, log_template: str, log_message: str) -> Sequence[str]: 189 | """ 190 | Extract parameters from a log message according to a provided template that was generated 191 | by calling `add_log_message()`. 192 | 193 | This function is deprecated. Please use extract_parameters instead. 194 | 195 | :param log_template: log template corresponding to the log message 196 | :param log_message: log message to extract parameters from 197 | :return: An ordered list of parameter values present in the log message. 198 | """ 199 | 200 | extracted_parameters = self.extract_parameters(log_template, log_message, exact_matching=False) 201 | if not extracted_parameters: 202 | return [] 203 | return [parameter.value for parameter in extracted_parameters] 204 | 205 | def extract_parameters(self, 206 | log_template: str, 207 | log_message: str, 208 | exact_matching: bool = True) -> Optional[Sequence[ExtractedParameter]]: 209 | """ 210 | Extract parameters from a log message according to a provided template that was generated 211 | by calling `add_log_message()`. 
212 | 213 | For most accurate results, it is recommended that 214 | - Each `MaskingInstruction` has a unique `mask_with` value, 215 | - No `MaskingInstruction` has a `mask_with` value of `*`, 216 | - The regex-patterns of `MaskingInstruction` do not use unnamed back-references; 217 | instead use back-references to named groups e.g. `(?P=some-name)`. 218 | 219 | :param log_template: log template corresponding to the log message 220 | :param log_message: log message to extract parameters from 221 | :param exact_matching: whether to apply the correct masking-patterns to match parameters, or try to approximate; 222 | disabling exact_matching may be faster but may lead to situations in which parameters 223 | are wrongly identified. 224 | :return: A ordered list of ExtractedParameter for the log message 225 | or None if log_message does not correspond to log_template. 226 | """ 227 | 228 | for delimiter in self.config.drain_extra_delimiters: 229 | log_message = re.sub(delimiter, " ", log_message) 230 | 231 | template_regex, param_group_name_to_mask_name = self._get_template_parameter_extraction_regex( 232 | log_template, exact_matching) 233 | 234 | # Parameters are represented by specific named groups inside template_regex. 235 | parameter_match = re.match(template_regex, log_message) 236 | 237 | # log template does not match template 238 | if not parameter_match: 239 | return None 240 | 241 | # create list of extracted parameters 242 | extracted_parameters = [] 243 | for group_name, parameter in parameter_match.groupdict().items(): 244 | if group_name in param_group_name_to_mask_name: 245 | mask_name = param_group_name_to_mask_name[group_name] 246 | extracted_parameter = ExtractedParameter(parameter, mask_name) 247 | extracted_parameters.append(extracted_parameter) 248 | 249 | return extracted_parameters 250 | 251 | @cachedmethod(lambda self: self.parameter_extraction_cache) 252 | def _get_template_parameter_extraction_regex(self, 253 | log_template: str, 254 | exact_matching: bool) -> Tuple[str, Mapping[str, str]]: 255 | param_group_name_to_mask_name = {} 256 | param_name_counter = [0] 257 | 258 | def get_next_param_name() -> str: 259 | param_group_name = f"p_{str(param_name_counter[0])}" 260 | param_name_counter[0] += 1 261 | return param_group_name 262 | 263 | # Create a named group with the respective patterns for the given mask-name. 264 | def create_capture_regex(_mask_name: str) -> str: 265 | allowed_patterns = [] 266 | if exact_matching: 267 | # get all possible regex patterns from masking instructions that match this mask name 268 | masking_instructions = self.masker.instructions_by_mask_name(_mask_name) 269 | for mi in masking_instructions: 270 | # MaskingInstruction may already contain named groups. 271 | # We replace group names in those named groups, to avoid conflicts due to duplicate names. 272 | if hasattr(mi, 'regex') and hasattr(mi, 'pattern'): 273 | mi_groups = mi.regex.groupindex.keys() 274 | pattern: str = mi.pattern 275 | else: 276 | # non regex masking instructions - support only non-exact matching 277 | mi_groups = [] 278 | pattern = ".+?" 
279 | 280 | for group_name in mi_groups: 281 | param_group_name = get_next_param_name() 282 | 283 | def replace_captured_param_name(param_pattern: str) -> str: 284 | _search_str = param_pattern.format(group_name) 285 | _replace_str = param_pattern.format(param_group_name) 286 | return pattern.replace(_search_str, _replace_str) 287 | 288 | pattern = replace_captured_param_name("(?P={}") 289 | pattern = replace_captured_param_name("(?P<{}>") 290 | 291 | # support unnamed back-references in masks (simple cases only) 292 | pattern = re.sub(r"\\(?!0)\d{1,2}", r"(?:.+?)", pattern) 293 | allowed_patterns.append(pattern) 294 | 295 | if not exact_matching or _mask_name == "*": 296 | allowed_patterns.append(r".+?") 297 | 298 | # Give each capture group a unique name to avoid conflicts. 299 | param_group_name = get_next_param_name() 300 | param_group_name_to_mask_name[param_group_name] = _mask_name 301 | joined_patterns = "|".join(allowed_patterns) 302 | capture_regex = f"(?P<{param_group_name}>{joined_patterns})" 303 | return capture_regex 304 | 305 | # For every mask in the template, replace it with a named group of all 306 | # possible masking-patterns it could represent (in order). 307 | mask_names = set(self.masker.mask_names) 308 | 309 | # the Drain catch-all mask 310 | mask_names.add("*") 311 | 312 | escaped_prefix = re.escape(self.masker.mask_prefix) 313 | escaped_suffix = re.escape(self.masker.mask_suffix) 314 | template_regex = re.escape(log_template) 315 | 316 | # replace each mask name with a proper regex that captures it 317 | for mask_name in mask_names: 318 | search_str = escaped_prefix + re.escape(mask_name) + escaped_suffix 319 | while True: 320 | rep_str = create_capture_regex(mask_name) 321 | # Replace one-by-one to get a new param group name for each replacement. 322 | template_regex_new = template_regex.replace(search_str, rep_str, 1) 323 | # Break when all replaces for this mask are done. 324 | if template_regex_new == template_regex: 325 | break 326 | template_regex = template_regex_new 327 | 328 | # match also messages with multiple spaces or other whitespace chars between tokens 329 | template_regex = re.sub(r"\\ ", r"\\s+", template_regex) 330 | template_regex = f"^{template_regex}$" 331 | return template_regex, param_group_name_to_mask_name 332 | -------------------------------------------------------------------------------- /drain3/drain.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # This file implements the Drain algorithm for log parsing. 
3 | # Based on https://github.com/logpai/logparser/blob/master/logparser/Drain/Drain.py by LogPAI team 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import cast, Collection, IO, Iterable, MutableMapping, MutableSequence, Optional, Sequence, Tuple, \ 7 | TYPE_CHECKING, TypeVar, Union 8 | 9 | from cachetools import LRUCache, Cache 10 | 11 | from drain3.simple_profiler import Profiler, NullProfiler 12 | 13 | 14 | class LogCluster: 15 | __slots__ = ["log_template_tokens", "cluster_id", "size"] 16 | 17 | def __init__(self, log_template_tokens: Iterable[str], cluster_id: int) -> None: 18 | self.log_template_tokens = tuple(log_template_tokens) 19 | self.cluster_id = cluster_id 20 | self.size = 1 21 | 22 | def get_template(self) -> str: 23 | return ' '.join(self.log_template_tokens) 24 | 25 | def __str__(self) -> str: 26 | return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}" 27 | 28 | 29 | _T = TypeVar("_T") 30 | if TYPE_CHECKING: 31 | class _LRUCache(LRUCache[int, Optional[LogCluster]]): 32 | # see https://github.com/python/mypy/issues/4148 for this hack 33 | ... 34 | else: 35 | _LRUCache = LRUCache 36 | 37 | class LogClusterCache(_LRUCache): 38 | """ 39 | Least Recently Used (LRU) cache which allows callers to conditionally skip 40 | cache eviction algorithm when accessing elements. 41 | """ 42 | 43 | def __missing__(self, key: int) -> None: 44 | return None 45 | 46 | def get(self, key: int, _: Union[Optional[LogCluster], _T] = None) -> Optional[LogCluster]: 47 | """ 48 | Returns the value of the item with the specified key without updating 49 | the cache eviction algorithm. 50 | """ 51 | return Cache.__getitem__(self, key) 52 | 53 | 54 | class Node: 55 | __slots__ = ["key_to_child_node", "cluster_ids"] 56 | 57 | def __init__(self) -> None: 58 | self.key_to_child_node: MutableMapping[str, Node] = {} 59 | self.cluster_ids: Sequence[int] = [] 60 | 61 | 62 | class DrainBase(ABC): 63 | def __init__(self, 64 | depth: int = 4, 65 | sim_th: float = 0.4, 66 | max_children: int = 100, 67 | max_clusters: Optional[int] = None, 68 | extra_delimiters: Sequence[str] = (), 69 | profiler: Profiler = NullProfiler(), 70 | param_str: str = "<*>", 71 | parametrize_numeric_tokens: bool = True) -> None: 72 | """ 73 | Create a new Drain instance. 74 | 75 | :param depth: max depth levels of log clusters. Minimum is 3. 76 | For example, for depth==4, Root is considered depth level 1. 77 | Token count is considered depth level 2. 78 | First log token is considered depth level 3. 79 | Log clusters below first token node are considered depth level 4. 80 | :param sim_th: similarity threshold - if percentage of similar tokens for a log message is below this 81 | number, a new log cluster will be created. 82 | :param max_children: max number of children of an internal node 83 | :param max_clusters: max number of tracked clusters (unlimited by default). 84 | When this number is reached, model starts replacing old clusters 85 | with a new ones according to the LRU policy. 86 | :param extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace). 87 | :param parametrize_numeric_tokens: whether to treat tokens that contains at least one digit 88 | as template parameters. 
89 | """ 90 | if depth < 3: 91 | raise ValueError("depth argument must be at least 3") 92 | 93 | self.log_cluster_depth = depth 94 | self.max_node_depth = depth - 2 # max depth of a prefix tree node, starting from zero 95 | self.sim_th = sim_th 96 | self.max_children = max_children 97 | self.root_node = Node() 98 | self.profiler = profiler 99 | self.extra_delimiters = extra_delimiters 100 | self.max_clusters = max_clusters 101 | self.param_str = param_str 102 | self.parametrize_numeric_tokens = parametrize_numeric_tokens 103 | 104 | self.id_to_cluster: MutableMapping[int, Optional[LogCluster]] = \ 105 | {} if max_clusters is None else LogClusterCache(maxsize=max_clusters) 106 | self.clusters_counter = 0 107 | 108 | @property 109 | def clusters(self) -> Collection[LogCluster]: 110 | return cast(Collection[LogCluster], self.id_to_cluster.values()) 111 | 112 | @staticmethod 113 | def has_numbers(s: Iterable[str]) -> bool: 114 | return any(char.isdigit() for char in s) 115 | 116 | def fast_match(self, 117 | cluster_ids: Collection[int], 118 | tokens: Sequence[str], 119 | sim_th: float, 120 | include_params: bool) -> Optional[LogCluster]: 121 | """ 122 | Find the best match for a log message (represented as tokens) versus a list of clusters 123 | :param cluster_ids: List of clusters to match against (represented by their IDs) 124 | :param tokens: the log message, separated to tokens. 125 | :param sim_th: minimum required similarity threshold (None will be returned in no clusters reached it) 126 | :param include_params: consider tokens matched to wildcard parameters in similarity threshold. 127 | :return: Best match cluster or None 128 | """ 129 | match_cluster = None 130 | 131 | max_sim: Union[int, float] = -1 132 | max_param_count = -1 133 | max_cluster = None 134 | 135 | for cluster_id in cluster_ids: 136 | # Try to retrieve cluster from cache with bypassing eviction 137 | # algorithm as we are only testing candidates for a match. 
138 | cluster = self.id_to_cluster.get(cluster_id) 139 | if cluster is None: 140 | continue 141 | cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params) 142 | if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count): 143 | max_sim = cur_sim 144 | max_param_count = param_count 145 | max_cluster = cluster 146 | 147 | if max_sim >= sim_th: 148 | match_cluster = max_cluster 149 | 150 | return match_cluster 151 | 152 | def print_tree(self, file: Optional[IO[str]] = None, max_clusters: int = 5) -> None: 153 | self.print_node("root", self.root_node, 0, file, max_clusters) 154 | 155 | def print_node(self, token: str, node: Node, depth: int, file: Optional[IO[str]], max_clusters: int) -> None: 156 | out_str = '\t' * depth 157 | 158 | if depth == 0: 159 | out_str += f'<{token}>' 160 | elif depth == 1: 161 | if token.isdigit(): 162 | out_str += f'' 163 | else: 164 | out_str += f'<{token}>' 165 | else: 166 | out_str += f'"{token}"' 167 | 168 | if len(node.cluster_ids) > 0: 169 | out_str += f" (cluster_count={len(node.cluster_ids)})" 170 | 171 | print(out_str, file=file) 172 | 173 | for token, child in node.key_to_child_node.items(): 174 | self.print_node(token, child, depth + 1, file, max_clusters) 175 | 176 | for cid in node.cluster_ids[:max_clusters]: 177 | cluster = self.id_to_cluster[cid] 178 | out_str = '\t' * (depth + 1) + str(cluster) 179 | print(out_str, file=file) 180 | 181 | def get_content_as_tokens(self, content: str) -> Sequence[str]: 182 | content = content.strip() 183 | for delimiter in self.extra_delimiters: 184 | content = content.replace(delimiter, " ") 185 | content_tokens = content.split() 186 | return content_tokens 187 | 188 | def add_log_message(self, content: str) -> Tuple[LogCluster, str]: 189 | content_tokens = self.get_content_as_tokens(content) 190 | 191 | if self.profiler: 192 | self.profiler.start_section("tree_search") 193 | match_cluster = self.tree_search(self.root_node, content_tokens, self.sim_th, False) 194 | if self.profiler: 195 | self.profiler.end_section() 196 | 197 | # Match no existing log cluster 198 | if match_cluster is None: 199 | if self.profiler: 200 | self.profiler.start_section("create_cluster") 201 | self.clusters_counter += 1 202 | cluster_id = self.clusters_counter 203 | match_cluster = LogCluster(content_tokens, cluster_id) 204 | self.id_to_cluster[cluster_id] = match_cluster 205 | self.add_seq_to_prefix_tree(self.root_node, match_cluster) 206 | update_type = "cluster_created" 207 | 208 | # Add the new log message to the existing cluster 209 | else: 210 | if self.profiler: 211 | self.profiler.start_section("cluster_exist") 212 | new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens) 213 | if tuple(new_template_tokens) == match_cluster.log_template_tokens: 214 | update_type = "none" 215 | else: 216 | match_cluster.log_template_tokens = tuple(new_template_tokens) 217 | update_type = "cluster_template_changed" 218 | match_cluster.size += 1 219 | # Touch cluster to update its state in the cache. 
220 | # noinspection PyStatementEffect 221 | self.id_to_cluster[match_cluster.cluster_id] 222 | 223 | if self.profiler: 224 | self.profiler.end_section() 225 | 226 | return match_cluster, update_type 227 | 228 | def get_total_cluster_size(self) -> int: 229 | size = 0 230 | for c in self.id_to_cluster.values(): 231 | size += cast(LogCluster, c).size 232 | return size 233 | 234 | def get_clusters_ids_for_seq_len(self, seq_fir: Union[int, str]) -> Collection[int]: 235 | """ 236 | seq_fir: int/str - the first token of the sequence 237 | Return all clusters with the specified count of tokens 238 | """ 239 | 240 | def append_clusters_recursive(node: Node, id_list_to_fill: MutableSequence[int]) -> None: 241 | id_list_to_fill.extend(node.cluster_ids) 242 | for child_node in node.key_to_child_node.values(): 243 | append_clusters_recursive(child_node, id_list_to_fill) 244 | 245 | cur_node = self.root_node.key_to_child_node.get(str(seq_fir)) 246 | 247 | # no template with same token count 248 | if cur_node is None: 249 | return [] 250 | 251 | target: MutableSequence[int] = [] 252 | append_clusters_recursive(cur_node, target) 253 | return target 254 | 255 | @abstractmethod 256 | def tree_search(self, 257 | root_node: Node, 258 | tokens: Sequence[str], 259 | sim_th: float, 260 | include_params: bool) -> Optional[LogCluster]: 261 | ... 262 | 263 | @abstractmethod 264 | def add_seq_to_prefix_tree(self, root_node: Node, cluster: LogCluster) -> None: 265 | ... 266 | 267 | @abstractmethod 268 | def get_seq_distance(self, seq1: Sequence[str], seq2: Sequence[str], include_params: bool) -> Tuple[float, int]: 269 | ... 270 | 271 | @abstractmethod 272 | def create_template(self, seq1: Sequence[str], seq2: Sequence[str]) -> Sequence[str]: 273 | ... 274 | 275 | @abstractmethod 276 | def match(self, content: str, full_search_strategy: str = "never") -> Optional[LogCluster]: 277 | ... 
278 | 279 | 280 | class Drain(DrainBase): 281 | 282 | def tree_search(self, 283 | root_node: Node, 284 | tokens: Sequence[str], 285 | sim_th: float, 286 | include_params: bool) -> Optional[LogCluster]: 287 | 288 | # at first level, children are grouped by token (word) count 289 | token_count = len(tokens) 290 | cur_node = root_node.key_to_child_node.get(str(token_count)) 291 | 292 | # no template with same token count yet 293 | if cur_node is None: 294 | return None 295 | 296 | # handle case of empty log string - return the single cluster in that group 297 | if token_count == 0: 298 | return self.id_to_cluster.get(cur_node.cluster_ids[0]) 299 | 300 | # find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth) 301 | cur_node_depth = 1 302 | for token in tokens: 303 | # at max depth 304 | if cur_node_depth >= self.max_node_depth: 305 | break 306 | 307 | # this is last token 308 | if cur_node_depth == token_count: 309 | break 310 | 311 | key_to_child_node = cur_node.key_to_child_node 312 | cur_node = key_to_child_node.get(token) 313 | if cur_node is None: # no exact next token exist, try wildcard node 314 | cur_node = key_to_child_node.get(self.param_str) 315 | if cur_node is None: # no wildcard node exist 316 | return None 317 | 318 | cur_node_depth += 1 319 | 320 | # get best match among all clusters with same prefix, or None if no match is above sim_th 321 | cluster = self.fast_match(cur_node.cluster_ids, tokens, sim_th, include_params) 322 | return cluster 323 | 324 | def add_seq_to_prefix_tree(self, root_node: Node, cluster: LogCluster) -> None: 325 | token_count = len(cluster.log_template_tokens) 326 | token_count_str = str(token_count) 327 | if token_count_str not in root_node.key_to_child_node: 328 | first_layer_node = Node() 329 | root_node.key_to_child_node[token_count_str] = first_layer_node 330 | else: 331 | first_layer_node = root_node.key_to_child_node[token_count_str] 332 | 333 | cur_node = first_layer_node 334 | 335 | # handle case of empty log string 336 | if token_count == 0: 337 | cur_node.cluster_ids = [cluster.cluster_id] 338 | return 339 | 340 | current_depth = 1 341 | for token in cluster.log_template_tokens: 342 | 343 | # if at max depth or this is last token in template - add current log cluster to the leaf node 344 | if current_depth >= self.max_node_depth or current_depth >= token_count: 345 | # clean up stale clusters before adding a new one. 346 | new_cluster_ids = [] 347 | for cluster_id in cur_node.cluster_ids: 348 | if cluster_id in self.id_to_cluster: 349 | new_cluster_ids.append(cluster_id) 350 | new_cluster_ids.append(cluster.cluster_id) 351 | cur_node.cluster_ids = new_cluster_ids 352 | break 353 | 354 | # if token not matched in this layer of existing tree. 
355 | if token not in cur_node.key_to_child_node: 356 | if self.parametrize_numeric_tokens and self.has_numbers(token): 357 | if self.param_str not in cur_node.key_to_child_node: 358 | new_node = Node() 359 | cur_node.key_to_child_node[self.param_str] = new_node 360 | cur_node = new_node 361 | else: 362 | cur_node = cur_node.key_to_child_node[self.param_str] 363 | 364 | else: 365 | if self.param_str in cur_node.key_to_child_node: 366 | if len(cur_node.key_to_child_node) < self.max_children: 367 | new_node = Node() 368 | cur_node.key_to_child_node[token] = new_node 369 | cur_node = new_node 370 | else: 371 | cur_node = cur_node.key_to_child_node[self.param_str] 372 | else: 373 | if len(cur_node.key_to_child_node) + 1 < self.max_children: 374 | new_node = Node() 375 | cur_node.key_to_child_node[token] = new_node 376 | cur_node = new_node 377 | elif len(cur_node.key_to_child_node) + 1 == self.max_children: 378 | new_node = Node() 379 | cur_node.key_to_child_node[self.param_str] = new_node 380 | cur_node = new_node 381 | else: 382 | cur_node = cur_node.key_to_child_node[self.param_str] 383 | 384 | # if the token is matched 385 | else: 386 | cur_node = cur_node.key_to_child_node[token] 387 | 388 | current_depth += 1 389 | 390 | # seq1 is a template, seq2 is the log to match 391 | def get_seq_distance(self, seq1: Sequence[str], seq2: Sequence[str], include_params: bool) -> Tuple[float, int]: 392 | assert len(seq1) == len(seq2) 393 | 394 | # sequences are empty - full match 395 | if len(seq1) == 0: 396 | return 1.0, 0 397 | 398 | sim_tokens = 0 399 | param_count = 0 400 | 401 | for token1, token2 in zip(seq1, seq2): 402 | if token1 == self.param_str: 403 | param_count += 1 404 | continue 405 | if token1 == token2: 406 | sim_tokens += 1 407 | 408 | if include_params: 409 | sim_tokens += param_count 410 | 411 | ret_val = float(sim_tokens) / len(seq1) 412 | 413 | return ret_val, param_count 414 | 415 | def create_template(self, seq1: Sequence[str], seq2: Sequence[str]) -> Sequence[str]: 416 | """ 417 | Loop through two sequences and create a template sequence that 418 | replaces unmatched tokens with the parameter string. 419 | 420 | :param seq1: first sequence 421 | :param seq2: second sequence 422 | :return: template sequence with param_str in place of unmatched tokens 423 | """ 424 | assert len(seq1) == len(seq2) 425 | return [token2 if token1 == token2 else self.param_str for token1, token2 in zip(seq1, seq2)] 426 | 427 | def match(self, content: str, full_search_strategy: str = "never") -> Optional[LogCluster]: 428 | """ 429 | Match log message against an already existing cluster. 430 | Match shall be perfect (sim_th=1.0). 431 | New cluster will not be created as a result of this call, nor any cluster modifications. 432 | 433 | :param content: log message to match 434 | :param full_search_strategy: when to perform full cluster search. 435 | (1) "never" is the fastest, will always perform a tree search [O(log(n)] but might produce 436 | false negatives (wrong mismatches) on some edge cases; 437 | (2) "fallback" will perform a linear search [O(n)] among all clusters with the same token count, but only in 438 | case tree search found no match. 439 | It should not have false negatives, however tree-search may find a non-optimal match with 440 | more wildcard parameters than necessary; 441 | (3) "always" is the slowest. 
It will select the best match among all known clusters, by always evaluating 442 | all clusters with the same token count, and selecting the cluster with perfect all token match and least 443 | count of wildcard matches. 444 | :return: Matched cluster or None if no match found. 445 | """ 446 | 447 | assert full_search_strategy in ["always", "never", "fallback"] 448 | 449 | required_sim_th = 1.0 450 | content_tokens = self.get_content_as_tokens(content) 451 | 452 | # consider for future improvement: 453 | # It is possible to implement a recursive tree_search (first try exact token match and fallback to 454 | # wildcard match). This will be both accurate and more efficient than the linear full search 455 | # also fast match can be optimized when exact match is required by early 456 | # quitting on less than exact cluster matches. 457 | def full_search() -> Optional[LogCluster]: 458 | all_ids = self.get_clusters_ids_for_seq_len(len(content_tokens)) 459 | cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True) 460 | return cluster 461 | 462 | if full_search_strategy == "always": 463 | return full_search() 464 | 465 | match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True) 466 | if match_cluster is not None: 467 | return match_cluster 468 | 469 | if full_search_strategy == "never": 470 | return None 471 | 472 | return full_search() 473 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Drain3 2 | 3 | ## Important Update 4 | 5 | Drain3 was moved to the `logpai` GitHub organization (which is also the home for the original Drain implementation). We always welcome more contributors and maintainers to join us and push the project forward. We welcome more contributions and variants of implementations if you find practical enhancements to the algorithm in production scenarios. 6 | 7 | ## Introduction 8 | 9 | Drain3 is an online log template miner that can extract templates (clusters) from a stream of log messages in a timely 10 | manner. It employs a parse tree with fixed depth to guide the log group search process, which effectively avoids 11 | constructing a very deep and unbalanced tree. 12 | 13 | Drain3 continuously learns on-the-fly and extracts log templates from raw log entries. 14 | 15 | #### Example: 16 | 17 | For the input: 18 | 19 | ``` 20 | connected to 10.0.0.1 21 | connected to 192.168.0.1 22 | Hex number 0xDEADBEAF 23 | user davidoh logged in 24 | user eranr logged in 25 | ``` 26 | 27 | Drain3 extracts the following templates: 28 | 29 | ``` 30 | ID=1 : size=2 : connected to <:IP:> 31 | ID=2 : size=1 : Hex number <:HEX:> 32 | ID=3 : size=2 : user <:*:> logged in 33 | ``` 34 | 35 | Full sample program output: 36 | 37 | ``` 38 | Starting Drain3 template miner 39 | Checking for saved state 40 | Saved state not found 41 | Drain3 started with 'FILE' persistence 42 | Starting training mode. 
Reading from std-in ('q' to finish) 43 | > connected to 10.0.0.1 44 | Saving state of 1 clusters with 1 messages, 528 bytes, reason: cluster_created (1) 45 | {"change_type": "cluster_created", "cluster_id": 1, "cluster_size": 1, "template_mined": "connected to <:IP:>", "cluster_count": 1} 46 | Parameters: [ExtractedParameter(value='10.0.0.1', mask_name='IP')] 47 | > connected to 192.168.0.1 48 | {"change_type": "none", "cluster_id": 1, "cluster_size": 2, "template_mined": "connected to <:IP:>", "cluster_count": 1} 49 | Parameters: [ExtractedParameter(value='192.168.0.1', mask_name='IP')] 50 | > Hex number 0xDEADBEAF 51 | Saving state of 2 clusters with 3 messages, 584 bytes, reason: cluster_created (2) 52 | {"change_type": "cluster_created", "cluster_id": 2, "cluster_size": 1, "template_mined": "Hex number <:HEX:>", "cluster_count": 2} 53 | Parameters: [ExtractedParameter(value='0xDEADBEAF', mask_name='HEX')] 54 | > user davidoh logged in 55 | Saving state of 3 clusters with 4 messages, 648 bytes, reason: cluster_created (3) 56 | {"change_type": "cluster_created", "cluster_id": 3, "cluster_size": 1, "template_mined": "user davidoh logged in", "cluster_count": 3} 57 | Parameters: [] 58 | > user eranr logged in 59 | Saving state of 3 clusters with 5 messages, 644 bytes, reason: cluster_template_changed (3) 60 | {"change_type": "cluster_template_changed", "cluster_id": 3, "cluster_size": 2, "template_mined": "user <:*:> logged in", "cluster_count": 3} 61 | Parameters: [ExtractedParameter(value='eranr', mask_name='*')] 62 | > q 63 | Training done. Mined clusters: 64 | ID=1 : size=2 : connected to <:IP:> 65 | ID=2 : size=1 : Hex number <:HEX:> 66 | ID=3 : size=2 : user <:*:> logged in 67 | ``` 68 | 69 | This project is an upgrade of the original [Drain](https://github.com/logpai/logparser/blob/master/logparser/Drain) 70 | project by LogPAI from Python 2.7 to Python 3.6 or later with additional features and bug-fixes. 71 | 72 | Read more information about Drain from the following paper: 73 | 74 | - Pinjia He, Jieming Zhu, Zibin Zheng, and Michael R. 75 | Lyu. [Drain: An Online Log Parsing Approach with Fixed Depth Tree](http://jiemingzhu.github.io/pub/pjhe_icws2017.pdf), 76 | Proceedings of the 24th International Conference on Web Services (ICWS), 2017. 77 | 78 | A Drain3 use case is presented in this blog 79 | post: [Use open source Drain3 log-template mining project to monitor for network outages](https://developer.ibm.com/blogs/how-mining-log-templates-can-help-ai-ops-in-cloud-scale-data-centers) 80 | . 81 | 82 | #### New features 83 | 84 | - [**Persistence**](#persistence). Save and load Drain state into an [Apache Kafka](https://kafka.apache.org) 85 | topic, [Redis](https://redis.io/) or a file. 86 | - **Streaming**. Support feeding Drain with messages one-be-one. 87 | - [**Masking**](#masking). Replace some message parts (e.g numbers, IPs, emails) with wildcards. This improves the 88 | accuracy of template mining. 89 | - [**Packaging**](#installation). As a pip package. 90 | - [**Configuration**](#configuration). Support for configuring Drain3 using an `.ini` file or a configuration object. 91 | - [**Memory efficiency**](#memory-efficiency). Decrease the memory footprint of internal data structures and introduce 92 | cache to control max memory consumed (thanks to @StanislawSwierc) 93 | - [**Inference mode**](#training-vs-inference-modes). 
In case you want to separate training and inference phases, Drain3
provides a function for *fast* matching against already-learned clusters (templates) only, without the usage of
regular expressions.
- [**Parameter extraction**](#parameter-extraction). Accurate extraction of the variable parts from a log message as an
ordered list, based on its mined template and the defined masking instructions (thanks to @Impelon).

#### Expected Input and Output

Although Drain3 can ingest a full raw log message, template mining accuracy can be improved if you feed it with
only the unstructured free-text portion of log messages, by first removing structured parts like timestamp, hostname,
severity, etc.

The output is a dictionary with the following fields:

- `change_type` - indicates whether a new template was identified, an existing template was changed, or the message was
added to an existing cluster.
- `cluster_id` - Sequential ID of the cluster that the log belongs to.
- `cluster_size` - The size (message count) of the cluster that the log belongs to.
- `cluster_count` - Number of clusters seen so far.
- `template_mined` - The latest template of the above `cluster_id`.

## Configuration

Drain3 is configured using [configparser](https://docs.python.org/3.4/library/configparser.html). By default, the config
filename is `drain3.ini` in the working directory. Drain3 can also be configured by passing
a [TemplateMinerConfig](drain3/template_miner_config.py) object to the [TemplateMiner](drain3/template_miner.py)
constructor.

Primary configuration parameters:

- `[DRAIN]/sim_th` - similarity threshold. If the percentage of similar tokens for a log message is below this number, a new
log cluster will be created (default 0.4).
- `[DRAIN]/depth` - max depth levels of log clusters. Minimum is 3 (default 4).
- `[DRAIN]/max_children` - max number of children of an internal node (default 100).
- `[DRAIN]/max_clusters` - max number of tracked clusters (unlimited by default). When this number is reached, the model
starts replacing old clusters with new ones according to the LRU cache eviction policy.
- `[DRAIN]/extra_delimiters` - delimiters to apply when splitting a log message into words, in addition to whitespace
(default none). Format is a Python list, e.g. `['_', ':']`.
- `[MASKING]/masking` - parameter masking instructions, in JSON format (default "").
- `[MASKING]/mask_prefix` & `[MASKING]/mask_suffix` - the wrapping of identified parameters in templates. By default, it
is `<` and `>` respectively.
- `[SNAPSHOT]/snapshot_interval_minutes` - time interval for new snapshots (default 1).
- `[SNAPSHOT]/compress_state` - whether to compress the state before saving it. This can be useful when using Kafka
persistence.

## Masking

This feature allows masking specific variable parts of the log message with keywords before the message is passed to
Drain. Well-defined masking can improve template mining accuracy.

Template parameters that do not match any custom mask in the preliminary masking phase are replaced with `<*>` by the
Drain core.

Use a list of regular expressions in the configuration file with the format `{'regex_pattern', 'mask_with'}` to set
custom masking.
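Masking can also be configured programmatically by appending `MaskingInstruction` objects to a `TemplateMinerConfig`
before constructing the `TemplateMiner`. A minimal sketch, assuming the default mask prefix/suffix (`<` / `>`); the regex
patterns mirror the `.ini` example below and are illustrative only:

```python
from drain3 import TemplateMiner
from drain3.masking import MaskingInstruction
from drain3.template_miner_config import TemplateMinerConfig

config = TemplateMinerConfig()
# Mask IPv4 addresses and integers before template mining.
config.masking_instructions.append(MaskingInstruction(
    r"((?<=[^A-Za-z0-9])|^)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})((?=[^A-Za-z0-9])|$)", "IP"))
config.masking_instructions.append(MaskingInstruction(
    r"((?<=[^A-Za-z0-9])|^)([\-\+]?\d+)((?=[^A-Za-z0-9])|$)", "NUM"))

template_miner = TemplateMiner(config=config)
result = template_miner.add_log_message("connected to 10.0.0.1 port 62891")
# With the default mask prefix/suffix this should yield something like:
#   connected to <IP> port <NUM>
print(result["template_mined"])
```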
138 | ## Masking 139 | 140 | This feature allows masking specific variable parts of a log message with keywords before it is passed to Drain. A 141 | well-defined masking can improve template mining accuracy. 142 | 143 | Template parameters that do not match any custom mask in the preliminary masking phase are replaced with `<*>` by the Drain 144 | core. 145 | 146 | To set custom masking, use a list of regular expressions in the configuration file, each entry in the 147 | format `{"regex_pattern": ..., "mask_with": ...}`. 148 | 149 | For example, the following masking instructions in `drain3.ini` will mask IP addresses and integers: 150 | 151 | ``` 152 | [MASKING] 153 | masking = [ 154 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, 155 | {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"} 156 | ] 157 | 158 | ``` 159 | 160 | ## Persistence 161 | 162 | The persistence feature saves and loads a snapshot of Drain3 state in a (compressed) json format. This feature adds 163 | restart resiliency to Drain, allowing it to continue where it left off and keep its learned knowledge across restarts. 164 | 165 | Drain3 state includes the search tree and all the clusters that were identified up until snapshot time. 166 | 167 | The snapshot also persists the number of log messages matched by each cluster, and its `cluster_id`. 168 | 169 | An example of a snapshot: 170 | 171 | ```json 172 | { 173 | "clusters": [ 174 | { 175 | "cluster_id": 1, 176 | "log_template_tokens": [ 177 | "aa", 178 | "aa", 179 | "<*>" 180 | ], 181 | "py/object": "drain3_core.LogCluster", 182 | "size": 2 183 | }, 184 | { 185 | "cluster_id": 2, 186 | "log_template_tokens": [ 187 | "My", 188 | "IP", 189 | "is", 190 | "<IP>" 191 | ], 192 | "py/object": "drain3_core.LogCluster", 193 | "size": 1 194 | } 195 | ] 196 | } 197 | ``` 198 | 199 | This example snapshot persists two clusters with the templates: 200 | 201 | `["aa", "aa", "<*>"]` - occurs twice 202 | 203 | `["My", "IP", "is", "<IP>"]` - occurs once 204 | 205 | Snapshots are created on the following events: 206 | 207 | - `cluster_created` - whenever a new template is identified 208 | - `cluster_template_changed` - whenever the template of a cluster is updated 209 | - `periodic` - n minutes after the last snapshot. This is intended to save cluster sizes even if no new template 210 | was identified. 211 | 212 | Drain3 currently supports the following persistence modes: 213 | 214 | - **Kafka** - The snapshot is saved in a dedicated topic used only for snapshots - the last message in this topic is the 215 | last snapshot that will be loaded after restart. For Kafka persistence you need to provide `topic_name`. You may 216 | also provide other `kwargs` that are supported by `kafka.KafkaConsumer` and `kafka.Producer`, e.g. `bootstrap_servers` 217 | to change the Kafka endpoint (default is `localhost:9092`). 218 | 219 | - **Redis** - The snapshot is saved to a key in a Redis database (contributed by @matabares). 220 | 221 | - **File** - The snapshot is saved to a file. 222 | 223 | - **Memory** - The snapshot is saved to an in-memory object. 224 | 225 | - **None** - No persistence. 226 | 227 | Drain3 persistence can easily be extended to another medium / database by inheriting 228 | the [PersistenceHandler](drain3/persistence_handler.py) class. A usage sketch with the file-based handler is shown after the next section. 229 | 230 | ## Training vs. Inference modes 231 | 232 | In some use cases, it is required to separate the training and inference phases. 233 | 234 | In the training phase you should call `template_miner.add_log_message(log_line)`. This matches the log line against an 235 | existing cluster (if similarity is above the threshold) or creates a new cluster. It may also change the template of an 236 | existing cluster. 237 | 238 | In inference mode you should call `template_miner.match(log_line)`. This matches the log line against previously learned 239 | clusters only. No new clusters are created and the templates of existing clusters are not changed. The match to an existing cluster 240 | has to be perfect, otherwise `None` is returned. You can use the persistence option to load previously trained clusters 241 | before inference. 242 |
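For example, a minimal sketch combining file-based persistence with separate training and inference steps. The snapshot file name is arbitrary; Kafka, Redis and in-memory handlers are passed to `TemplateMiner` in the same way. The cluster object returned by `match()` is assumed to expose `cluster_id` and `get_template()`; check [drain.py](drain3/drain.py) if these names differ.

```python
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence

# snapshots are written to this file and reloaded from it on the next start
persistence = FilePersistence("drain3_state.bin")
template_miner = TemplateMiner(persistence)

# training phase: clusters may be created or their templates updated
for line in ["user davidoh logged in", "user eranr logged in"]:
    template_miner.add_log_message(line)

# inference phase: match against the learned clusters only, nothing is created or changed
cluster = template_miner.match("user root logged in")
if cluster is None:
    print("no matching cluster")
else:
    print(cluster.cluster_id, cluster.get_template())
```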
243 | ## Memory efficiency 244 | 245 | This feature limits the max memory used by the model. It is particularly important for large and possibly unbounded log 246 | streams. This feature is controlled by the `max_clusters` parameter, which sets the max number of clusters/templates 247 | tracked by the model. When the limit is reached, new templates start to replace the old ones according to the Least 248 | Recently Used (LRU) eviction policy. This makes the model adapt quickly to the most recent templates in the log stream. 249 | 250 | ## Parameter Extraction 251 | 252 | Drain3 supports retrieving an ordered list of variables in a log message, after its template was mined. Each parameter 253 | is accompanied by the name of the mask that was matched, or `*` for the catch-all mask. 254 | 255 | Parameter extraction is performed by generating a regular expression that matches the template and then applying it on 256 | the log message. When `exact_matching` is enabled (the default), the generated regex includes the regular expressions 257 | defined in the relevant masking instructions. If there are multiple masking instructions with the same name, either match 258 | can satisfy the regex. It is possible to disable exact matching so that every variable is matched against a 259 | non-whitespace character sequence. This may improve performance at the expense of accuracy. 260 | 261 | Parameter extraction regexes generated per template are cached by default, to improve performance. You can control the cache 262 | size with the `MASKING/parameter_extraction_cache_capacity` configuration parameter. 263 | 264 | Sample usage: 265 | 266 | ```python 267 | result = template_miner.add_log_message(log_line) 268 | params = template_miner.extract_parameters( 269 | result["template_mined"], log_line, exact_matching=True) 270 | ``` 271 | 272 | For the input `"user johndoe logged in 11 minutes ago"`, the template would be: 273 | 274 | ``` 275 | "user <:*:> logged in <:NUM:> minutes ago" 276 | ``` 277 | 278 | ... and the extracted parameters: 279 | 280 | ``` 281 | [ 282 | ExtractedParameter(value='johndoe', mask_name='*'), 283 | ExtractedParameter(value='11', mask_name='NUM') 284 | ] 285 | ``` 286 | 287 | ## Installation 288 | 289 | Drain3 is available from [PyPI](https://pypi.org/project/drain3). To install, use `pip`: 290 | 291 | ``` 292 | pip3 install drain3 293 | ``` 294 | 295 | Note: if you decide to use Kafka or Redis persistence, you should install the relevant client library explicitly, since it 296 | is declared as an extra (optional) dependency, by either: 297 | 298 | ``` 299 | pip3 install kafka-python 300 | ``` 301 | 302 | -- or -- 303 | 304 | ``` 305 | pip3 install redis 306 | ``` 307 | 308 | ## Examples 309 | 310 | In order to run the examples directly from the repository, you need to install the dependencies. You can do that using 311 | *pipenv* by executing the following command (assuming pipenv is already installed): 312 | 313 | ```shell 314 | python3 -m pipenv sync 315 | ``` 316 | 317 | #### Example 1 - `drain_stdin_demo` 318 | 319 | Run [examples/drain_stdin_demo.py](examples/drain_stdin_demo.py) from the root folder of the repository by: 320 | 321 | ``` 322 | python3 -m pipenv run python -m examples.drain_stdin_demo 323 | ``` 324 | 325 | This example runs Drain3 on input from stdin and persists its state to either Kafka, a file, or no persistence at all.
326 | 327 | Change the `persistence_type` variable in the example to change the persistence mode. 328 | 329 | Enter several log lines using the command line. Press `q` to end the online learn-and-match mode. 330 | 331 | Next, the demo switches to match (inference) only mode, in which no new clusters are trained and input is matched against 332 | previously trained clusters only. Press `q` again to finish execution. 333 | 334 | #### Example 2 - `drain_bigfile_demo` 335 | 336 | Run [examples/drain_bigfile_demo](examples/drain_bigfile_demo.py) from the root folder of the repository by: 337 | 338 | ``` 339 | python3 -m pipenv run python -m examples.drain_bigfile_demo 340 | ``` 341 | 342 | This example downloads a real-world log file (of an SSH server) and processes all lines, then prints the resulting clusters, 343 | the prefix tree and performance statistics. 344 | 345 | #### Sample config file 346 | 347 | An example `drain3.ini` file with masking instructions can be found in the [examples](examples) folder as well. 348 | 349 | ## Contributing 350 | 351 | Our project welcomes external contributions. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for further details. 352 | 353 | ## Change Log 354 | 355 | ##### v0.9.11 356 | 357 | * Fixed possible DivideByZero error when the profiler is enabled - [Issue #65](https://github.com/IBM/Drain3/issues/65). 358 | 359 | ##### v0.9.10 360 | 361 | * Fixed compatibility issue with Python 3.10 caused by removal of `KeysView`. 362 | 363 | ##### v0.9.9 364 | 365 | * Added support for accurate log message parameter extraction in a new function - `extract_parameters()`. The 366 | function `get_parameter_list()` is deprecated (thanks to *@Impelon*). 367 | * Refactored `AbstractMaskingInstruction` as a base class for `RegexMaskingInstruction`, allowing the introduction of other 368 | types of masking mechanisms. 369 | 370 | ##### v0.9.8 371 | 372 | * Added a `full_search_strategy` option in `TemplateMiner.match()` and `Drain.match()`. See more info at 373 | Issue [#48](https://github.com/IBM/Drain3/issues/48). 374 | * Added an option to disable parameterization of tokens that contain digits in 375 | configuration: `TemplateMinerConfig.parametrize_numeric_tokens` 376 | * Loading a Drain snapshot now only restores the cluster state and not configuration parameters. This improves backwards 377 | compatibility when introducing new Drain configuration parameters. 378 | 379 | ##### v0.9.7 380 | 381 | * Fixed bug in original Drain: log clusters were created multiple times for log messages with fewer tokens 382 | than `max_node_depth`. 383 | * Changed the `depth` property to a more descriptive name, `max_node_depth`, as Drain always subtracts 2 from the `depth` 384 | argument value. Also added a `log_cluster_depth` property to reflect the original value of the depth argument (Breaking Change). 385 | * Restricted the `depth` param to a minimum sensible value of 3. 386 | * Added log cluster count to nodes in `Drain.print_tree()` 387 | * Added optional log cluster details to `Drain.print_tree()` 388 | 389 | ##### v0.9.6 390 | 391 | * Fix issue https://github.com/IBM/Drain3/issues/38: Unnecessary update of LRU cache in case `max_clusters` is used 392 | (thanks *@StanislawSwierc*). 393 | 394 | ##### v0.9.5 395 | 396 | * Added: `TemplateMiner.match()` function for fast matching against existing clusters only.
397 | 398 | ##### v0.9.4 399 | 400 | * Added: `TemplateMiner.get_parameter_list()` function to extract template parameters for a raw log message (thanks to 401 | *@cwyalpha*) 402 | * Added option to customize the mask wrapper - instead of the default wrappers (`<*>` etc.), you can select any wrapper prefix 403 | or suffix by overriding `TemplateMinerConfig.mask_prefix` and `TemplateMinerConfig.mask_suffix` 404 | * Fixed: config `.ini` file is always read from the same folder as the source file in demos and tests (thanks *@RobinMaas95*) 405 | 406 | ##### v0.9.3 407 | 408 | * Fixed: comparison of type int with type str in function `add_seq_to_prefix_tree` #28 (bug introduced in v0.9.1) 409 | 410 | ##### v0.9.2 411 | 412 | * Updated jsonpickle version 413 | * Keys of the `id_to_cluster` dict are now persisted by jsonpickle as `int` instead of `str` to avoid key type conversion on 414 | snapshot load, which caused some issues. 415 | * Added cachetools dependency to `setup.py`. 416 | 417 | ##### v0.9.1 418 | 419 | * Added option to configure `TemplateMiner` using a configuration object (without an `.ini` file). 420 | * Support for `print_tree()` to a file/stream. 421 | * Added `MemoryBufferPersistence` 422 | * Added unit tests for state save/load. 423 | * Bug fix: missing type-conversion in state loading, introduced in v0.9.0 424 | * Refactor: Drain prefix tree keys are now of type `str` also for the 1st level 425 | (was `int` before), for type consistency. 426 | 427 | ##### v0.9.0 428 | 429 | * Decreased the memory footprint of the main data structures. 430 | * Added `max_clusters` option to limit the number of tracked clusters. 431 | * Changed cluster identifier type from str to int 432 | * Added more unit tests and CI 433 | 434 | ##### v0.8.6 435 | 436 | * Added `extra_delimiters` configuration option to Drain 437 | 438 | ##### v0.8.5 439 | 440 | * Profiler improvements 441 | 442 | ##### v0.8.4 443 | 444 | * Masking speed improvement 445 | 446 | ##### v0.8.3 447 | 448 | * Fix: profiler state after load from snapshot 449 | 450 | ##### v0.8.2 451 | 452 | * Fixed snapshot backward compatibility to v0.7.9 453 | 454 | ##### v0.8.1 455 | 456 | * Bugfix in profiling configuration read 457 | 458 | ##### v0.8.0 459 | 460 | * Added time profiling support (disabled by default) 461 | * Added cluster ID to snapshot reason log (credit: @boernd) 462 | * Minor readability and documentation improvements in Drain 463 | 464 | ##### v0.7.9 465 | 466 | * Fix: `KafkaPersistence` now also accepts `bootstrap_servers` as kwargs. 467 | 468 | ##### v0.7.8 469 | 470 | * Using the `kafka-python` package instead of `kafka` (newer). 471 | * Added support for specifying additional configuration as `kwargs` in the Kafka persistence handler. 472 | 473 | ##### v0.7.7 474 | 475 | * Corrected default Drain config values. 476 | 477 | ##### v0.7.6 478 | 479 | * Improvement in config file handling (Note: new sections were added instead of the `DEFAULT` section) 480 | 481 | ##### v0.7.5 482 | 483 | * Made Kafka and Redis optional requirements 484 | 485 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
2 | 3 | [[package]] 4 | name = "async-timeout" 5 | version = "4.0.2" 6 | description = "Timeout context manager for asyncio programs" 7 | optional = true 8 | python-versions = ">=3.6" 9 | files = [ 10 | {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, 11 | {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, 12 | ] 13 | 14 | [package.dependencies] 15 | typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} 16 | 17 | [[package]] 18 | name = "cachetools" 19 | version = "5.3.1" 20 | description = "Extensible memoizing collections and decorators" 21 | optional = false 22 | python-versions = ">=3.7" 23 | files = [ 24 | {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, 25 | {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, 26 | ] 27 | 28 | [[package]] 29 | name = "cffi" 30 | version = "1.15.1" 31 | description = "Foreign Function Interface for Python calling C code." 32 | optional = false 33 | python-versions = "*" 34 | files = [ 35 | {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, 36 | {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, 37 | {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, 38 | {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, 39 | {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, 40 | {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, 41 | {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, 42 | {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, 43 | {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, 44 | {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, 45 | {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, 46 | {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, 47 | {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, 48 | {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, 49 | {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, 50 | {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, 51 | {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, 52 | {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, 53 | {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, 54 | {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, 55 | {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, 56 | {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, 57 | {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, 58 | {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, 59 | {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, 60 | {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, 61 | {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, 62 | {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, 63 | {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, 64 | {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, 65 | {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, 66 | {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, 67 | {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, 68 | {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, 69 | {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, 70 | {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, 71 | {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, 72 | {file = 
"cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, 73 | {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, 74 | {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, 75 | {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, 76 | {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, 77 | {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, 78 | {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, 79 | {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, 80 | {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, 81 | {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, 82 | {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, 83 | {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, 84 | {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, 85 | {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, 86 | {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, 87 | {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, 88 | {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, 89 | {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, 90 | {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, 91 | {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, 92 | {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, 93 | {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, 94 | {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, 95 | {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, 96 | {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, 97 | {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, 98 | {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, 99 | ] 100 | 101 | [package.dependencies] 102 | pycparser = "*" 103 | 104 | [[package]] 105 | name = "cryptography" 106 | version = "41.0.3" 107 | description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 108 | optional = false 109 | python-versions = ">=3.7" 110 | files = [ 111 | {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, 112 | {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, 113 | {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, 114 | {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, 115 | {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, 116 | {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, 117 | {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, 118 | {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, 119 | {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, 120 | {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, 121 | {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, 122 | {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, 123 | {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, 124 | {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, 125 | {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, 126 | {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash 
= "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, 127 | {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, 128 | {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, 129 | {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, 130 | {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, 131 | {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, 132 | {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, 133 | {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, 134 | ] 135 | 136 | [package.dependencies] 137 | cffi = ">=1.12" 138 | 139 | [package.extras] 140 | docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] 141 | docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] 142 | nox = ["nox"] 143 | pep8test = ["black", "check-sdist", "mypy", "ruff"] 144 | sdist = ["build"] 145 | ssh = ["bcrypt (>=3.1.5)"] 146 | test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] 147 | test-randomorder = ["pytest-randomly"] 148 | 149 | [[package]] 150 | name = "importlib-metadata" 151 | version = "6.7.0" 152 | description = "Read metadata from Python packages" 153 | optional = false 154 | python-versions = ">=3.7" 155 | files = [ 156 | {file = "importlib_metadata-6.7.0-py3-none-any.whl", hash = "sha256:cb52082e659e97afc5dac71e79de97d8681de3aa07ff18578330904a9d18e5b5"}, 157 | {file = "importlib_metadata-6.7.0.tar.gz", hash = "sha256:1aaf550d4f73e5d6783e7acb77aec43d49da8017410afae93822cc9cca98c4d4"}, 158 | ] 159 | 160 | [package.dependencies] 161 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 162 | zipp = ">=0.5" 163 | 164 | [package.extras] 165 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 166 | perf = ["ipython"] 167 | testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] 168 | 169 | [[package]] 170 | name = "jsonpickle" 171 | version = "3.0.1" 172 | description = "Python library for serializing any arbitrary object graph into JSON" 173 | optional = false 174 | python-versions = ">=3.7" 175 | files = [ 176 | {file = "jsonpickle-3.0.1-py2.py3-none-any.whl", hash = "sha256:130d8b293ea0add3845de311aaba55e6d706d0bb17bc123bd2c8baf8a39ac77c"}, 177 | {file = "jsonpickle-3.0.1.tar.gz", hash = "sha256:032538804795e73b94ead410800ac387fdb6de98f8882ac957fcd247e3a85200"}, 178 | ] 179 | 180 | [package.dependencies] 181 | importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} 182 | 183 | [package.extras] 184 | docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] 185 | testing = ["ecdsa", "feedparser", "gmpy2", "numpy", 
"pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] 186 | testing-libs = ["simplejson", "ujson"] 187 | 188 | [[package]] 189 | name = "kafka-python" 190 | version = "2.0.2" 191 | description = "Pure Python client for Apache Kafka" 192 | optional = true 193 | python-versions = "*" 194 | files = [ 195 | {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, 196 | {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, 197 | ] 198 | 199 | [package.extras] 200 | crc32c = ["crc32c"] 201 | 202 | [[package]] 203 | name = "mypy" 204 | version = "1.4.1" 205 | description = "Optional static typing for Python" 206 | optional = false 207 | python-versions = ">=3.7" 208 | files = [ 209 | {file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"}, 210 | {file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"}, 211 | {file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"}, 212 | {file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"}, 213 | {file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"}, 214 | {file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"}, 215 | {file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"}, 216 | {file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"}, 217 | {file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"}, 218 | {file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"}, 219 | {file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"}, 220 | {file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"}, 221 | {file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"}, 222 | {file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"}, 223 | {file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"}, 224 | {file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"}, 225 | {file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"}, 226 | {file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"}, 227 | {file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"}, 228 | {file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"}, 229 | {file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"}, 230 | {file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"}, 231 | {file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"}, 232 | {file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"}, 233 | {file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"}, 234 | {file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"}, 235 | ] 236 | 237 | [package.dependencies] 238 | mypy-extensions = ">=1.0.0" 239 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 240 | typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} 241 | typing-extensions = ">=4.1.0" 242 | 243 | [package.extras] 244 | dmypy = ["psutil (>=4.0)"] 245 | install-types = ["pip"] 246 | python2 = ["typed-ast (>=1.4.0,<2)"] 247 | reports = ["lxml"] 248 | 249 | [[package]] 250 | name = "mypy-extensions" 251 | version = "1.0.0" 252 | description = "Type system extensions for programs checked with the mypy type checker." 
253 | optional = false 254 | python-versions = ">=3.5" 255 | files = [ 256 | {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, 257 | {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, 258 | ] 259 | 260 | [[package]] 261 | name = "pycparser" 262 | version = "2.21" 263 | description = "C parser in Python" 264 | optional = false 265 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 266 | files = [ 267 | {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, 268 | {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, 269 | ] 270 | 271 | [[package]] 272 | name = "redis" 273 | version = "4.6.0" 274 | description = "Python client for Redis database and key-value store" 275 | optional = true 276 | python-versions = ">=3.7" 277 | files = [ 278 | {file = "redis-4.6.0-py3-none-any.whl", hash = "sha256:e2b03db868160ee4591de3cb90d40ebb50a90dd302138775937f6a42b7ed183c"}, 279 | {file = "redis-4.6.0.tar.gz", hash = "sha256:585dc516b9eb042a619ef0a39c3d7d55fe81bdb4df09a52c9cdde0d07bf1aa7d"}, 280 | ] 281 | 282 | [package.dependencies] 283 | async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""} 284 | importlib-metadata = {version = ">=1.0", markers = "python_version < \"3.8\""} 285 | typing-extensions = {version = "*", markers = "python_version < \"3.8\""} 286 | 287 | [package.extras] 288 | hiredis = ["hiredis (>=1.0.0)"] 289 | ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"] 290 | 291 | [[package]] 292 | name = "tomli" 293 | version = "2.0.1" 294 | description = "A lil' TOML parser" 295 | optional = false 296 | python-versions = ">=3.7" 297 | files = [ 298 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 299 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 300 | ] 301 | 302 | [[package]] 303 | name = "typed-ast" 304 | version = "1.5.5" 305 | description = "a fork of Python 2 and 3 ast modules with type comment support" 306 | optional = false 307 | python-versions = ">=3.6" 308 | files = [ 309 | {file = "typed_ast-1.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4bc1efe0ce3ffb74784e06460f01a223ac1f6ab31c6bc0376a21184bf5aabe3b"}, 310 | {file = "typed_ast-1.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5f7a8c46a8b333f71abd61d7ab9255440d4a588f34a21f126bbfc95f6049e686"}, 311 | {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:597fc66b4162f959ee6a96b978c0435bd63791e31e4f410622d19f1686d5e769"}, 312 | {file = "typed_ast-1.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d41b7a686ce653e06c2609075d397ebd5b969d821b9797d029fccd71fdec8e04"}, 313 | {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5fe83a9a44c4ce67c796a1b466c270c1272e176603d5e06f6afbc101a572859d"}, 314 | {file = "typed_ast-1.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d5c0c112a74c0e5db2c75882a0adf3133adedcdbfd8cf7c9d6ed77365ab90a1d"}, 315 | {file = "typed_ast-1.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:e1a976ed4cc2d71bb073e1b2a250892a6e968ff02aa14c1f40eba4f365ffec02"}, 316 | {file = 
"typed_ast-1.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c631da9710271cb67b08bd3f3813b7af7f4c69c319b75475436fcab8c3d21bee"}, 317 | {file = "typed_ast-1.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b445c2abfecab89a932b20bd8261488d574591173d07827c1eda32c457358b18"}, 318 | {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc95ffaaab2be3b25eb938779e43f513e0e538a84dd14a5d844b8f2932593d88"}, 319 | {file = "typed_ast-1.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61443214d9b4c660dcf4b5307f15c12cb30bdfe9588ce6158f4a005baeb167b2"}, 320 | {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6eb936d107e4d474940469e8ec5b380c9b329b5f08b78282d46baeebd3692dc9"}, 321 | {file = "typed_ast-1.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e48bf27022897577d8479eaed64701ecaf0467182448bd95759883300ca818c8"}, 322 | {file = "typed_ast-1.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:83509f9324011c9a39faaef0922c6f720f9623afe3fe220b6d0b15638247206b"}, 323 | {file = "typed_ast-1.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:44f214394fc1af23ca6d4e9e744804d890045d1643dd7e8229951e0ef39429b5"}, 324 | {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:118c1ce46ce58fda78503eae14b7664163aa735b620b64b5b725453696f2a35c"}, 325 | {file = "typed_ast-1.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4919b808efa61101456e87f2d4c75b228f4e52618621c77f1ddcaae15904fa"}, 326 | {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:fc2b8c4e1bc5cd96c1a823a885e6b158f8451cf6f5530e1829390b4d27d0807f"}, 327 | {file = "typed_ast-1.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:16f7313e0a08c7de57f2998c85e2a69a642e97cb32f87eb65fbfe88381a5e44d"}, 328 | {file = "typed_ast-1.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:2b946ef8c04f77230489f75b4b5a4a6f24c078be4aed241cfabe9cbf4156e7e5"}, 329 | {file = "typed_ast-1.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2188bc33d85951ea4ddad55d2b35598b2709d122c11c75cffd529fbc9965508e"}, 330 | {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0635900d16ae133cab3b26c607586131269f88266954eb04ec31535c9a12ef1e"}, 331 | {file = "typed_ast-1.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57bfc3cf35a0f2fdf0a88a3044aafaec1d2f24d8ae8cd87c4f58d615fb5b6311"}, 332 | {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:fe58ef6a764de7b4b36edfc8592641f56e69b7163bba9f9c8089838ee596bfb2"}, 333 | {file = "typed_ast-1.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d09d930c2d1d621f717bb217bf1fe2584616febb5138d9b3e8cdd26506c3f6d4"}, 334 | {file = "typed_ast-1.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:d40c10326893ecab8a80a53039164a224984339b2c32a6baf55ecbd5b1df6431"}, 335 | {file = "typed_ast-1.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fd946abf3c31fb50eee07451a6aedbfff912fcd13cf357363f5b4e834cc5e71a"}, 336 | {file = "typed_ast-1.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ed4a1a42df8a3dfb6b40c3d2de109e935949f2f66b19703eafade03173f8f437"}, 337 | {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:045f9930a1550d9352464e5149710d56a2aed23a2ffe78946478f7b5416f1ede"}, 338 | {file = "typed_ast-1.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:381eed9c95484ceef5ced626355fdc0765ab51d8553fec08661dce654a935db4"}, 339 | {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bfd39a41c0ef6f31684daff53befddae608f9daf6957140228a08e51f312d7e6"}, 340 | {file = "typed_ast-1.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8c524eb3024edcc04e288db9541fe1f438f82d281e591c548903d5b77ad1ddd4"}, 341 | {file = "typed_ast-1.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:7f58fabdde8dcbe764cef5e1a7fcb440f2463c1bbbec1cf2a86ca7bc1f95184b"}, 342 | {file = "typed_ast-1.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:042eb665ff6bf020dd2243307d11ed626306b82812aba21836096d229fdc6a10"}, 343 | {file = "typed_ast-1.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:622e4a006472b05cf6ef7f9f2636edc51bda670b7bbffa18d26b255269d3d814"}, 344 | {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1efebbbf4604ad1283e963e8915daa240cb4bf5067053cf2f0baadc4d4fb51b8"}, 345 | {file = "typed_ast-1.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0aefdd66f1784c58f65b502b6cf8b121544680456d1cebbd300c2c813899274"}, 346 | {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:48074261a842acf825af1968cd912f6f21357316080ebaca5f19abbb11690c8a"}, 347 | {file = "typed_ast-1.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:429ae404f69dc94b9361bb62291885894b7c6fb4640d561179548c849f8492ba"}, 348 | {file = "typed_ast-1.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:335f22ccb244da2b5c296e6f96b06ee9bed46526db0de38d2f0e5a6597b81155"}, 349 | {file = "typed_ast-1.5.5.tar.gz", hash = "sha256:94282f7a354f36ef5dbce0ef3467ebf6a258e370ab33d5b40c249fa996e590dd"}, 350 | ] 351 | 352 | [[package]] 353 | name = "types-cachetools" 354 | version = "5.3.0.6" 355 | description = "Typing stubs for cachetools" 356 | optional = false 357 | python-versions = "*" 358 | files = [ 359 | {file = "types-cachetools-5.3.0.6.tar.gz", hash = "sha256:595f0342d246c8ba534f5a762cf4c2f60ecb61e8002b8b2277fd5cf791d4e851"}, 360 | {file = "types_cachetools-5.3.0.6-py3-none-any.whl", hash = "sha256:f7f8a25bfe306f2e6bc2ad0a2f949d9e72f2d91036d509c36d3810bf728bc6e1"}, 361 | ] 362 | 363 | [[package]] 364 | name = "types-pyopenssl" 365 | version = "23.2.0.2" 366 | description = "Typing stubs for pyOpenSSL" 367 | optional = false 368 | python-versions = "*" 369 | files = [ 370 | {file = "types-pyOpenSSL-23.2.0.2.tar.gz", hash = "sha256:6a010dac9ecd42b582d7dd2cc3e9e40486b79b3b64bb2fffba1474ff96af906d"}, 371 | {file = "types_pyOpenSSL-23.2.0.2-py3-none-any.whl", hash = "sha256:19536aa3debfbe25a918cf0d898e9f5fbbe6f3594a429da7914bf331deb1b342"}, 372 | ] 373 | 374 | [package.dependencies] 375 | cryptography = ">=35.0.0" 376 | 377 | [[package]] 378 | name = "types-redis" 379 | version = "4.6.0.3" 380 | description = "Typing stubs for redis" 381 | optional = false 382 | python-versions = "*" 383 | files = [ 384 | {file = "types-redis-4.6.0.3.tar.gz", hash = "sha256:efdef37dc0c04bf5786195651fd694f8bfdd693eac09ec4af46d90f72652558f"}, 385 | {file = "types_redis-4.6.0.3-py3-none-any.whl", hash = "sha256:67c44c14369c33c2a300da2a50b5607c0fc888f7b85eeb7c73e15c78a0f05edd"}, 386 | ] 387 | 388 | [package.dependencies] 389 | cryptography = ">=35.0.0" 390 | types-pyOpenSSL = "*" 391 | 392 | [[package]] 393 | name = "typing-extensions" 394 | version = "4.7.1" 395 | description = "Backported and Experimental Type Hints for Python 3.7+" 396 | optional = false 397 | python-versions = ">=3.7" 398 | files = [ 399 
| {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, 400 | {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, 401 | ] 402 | 403 | [[package]] 404 | name = "zipp" 405 | version = "3.15.0" 406 | description = "Backport of pathlib-compatible object wrapper for zip files" 407 | optional = false 408 | python-versions = ">=3.7" 409 | files = [ 410 | {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, 411 | {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, 412 | ] 413 | 414 | [package.extras] 415 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 416 | testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 417 | 418 | [extras] 419 | all = ["kafka-python", "redis"] 420 | kafka = ["kafka-python"] 421 | redis = ["redis"] 422 | 423 | [metadata] 424 | lock-version = "2.0" 425 | python-versions = "^3.7" 426 | content-hash = "5b3714ad91781510078e19fc5e5ce57bcb57ff69edd98764b34f91c5bc509ea4" 427 | --------------------------------------------------------------------------------