├── maco ├── __init__.py ├── model │ ├── __init__.py │ └── model.py ├── exceptions.py ├── extractor.py ├── base_test.py ├── yara.py ├── collector.py ├── cli.py └── utils.py ├── demo_extractors ├── __init__.py ├── complex │ ├── __init__.py │ ├── complex_utils.py │ └── complex.py ├── requirements.txt ├── shared.py ├── nothing.py ├── terminator.py ├── elfy.py └── limit_other.py ├── extractor_setup ├── maco ├── README.md ├── LICENSE.md └── pyproject.toml ├── model_setup ├── maco ├── README.md ├── LICENSE.md └── pyproject.toml ├── tests ├── extractors │ ├── __init__.py │ ├── bob │ │ ├── __init__.py │ │ └── bob.py │ ├── import_rewriting │ │ └── __init__.py │ ├── test_basic.py │ ├── basic.py │ └── basic_longer.py ├── requirements.txt ├── data │ ├── example.txt.cart │ ├── trigger_complex.txt │ └── trigger_complex.txt.cart ├── pytest.ini ├── test_cli.py ├── test_extractor.py ├── test_demo_extractors.py ├── test_helpers.py ├── test_parallelism.py ├── test_base_test.py ├── test_detection.py ├── benchmark.py └── test_model.py ├── requirements.txt ├── .vscode ├── extensions.json └── settings.json ├── .pre-commit-config.yaml ├── tox.ini ├── LICENSE.md ├── pipelines ├── test.yaml └── publish.yaml ├── pyproject.toml ├── .gitignore └── README.md /maco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /extractor_setup/maco: -------------------------------------------------------------------------------- 1 | ../maco/ -------------------------------------------------------------------------------- /model_setup/maco: -------------------------------------------------------------------------------- 1 | ../maco/ -------------------------------------------------------------------------------- /tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model_setup/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /tests/extractors/bob/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo_extractors/complex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /extractor_setup/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /model_setup/LICENSE.md: -------------------------------------------------------------------------------- 1 | ../LICENSE.md -------------------------------------------------------------------------------- /extractor_setup/LICENSE.md: -------------------------------------------------------------------------------- 1 | ../LICENSE.md -------------------------------------------------------------------------------- /tests/extractors/import_rewriting/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | git-python 3 | -------------------------------------------------------------------------------- /maco/model/__init__.py: -------------------------------------------------------------------------------- 1 | from maco.model.model import * # noqa: F403 2 | -------------------------------------------------------------------------------- /demo_extractors/requirements.txt: -------------------------------------------------------------------------------- 1 | httpx 2 | 3 | # Install maco from source for testing 4 | ../ 5 | -------------------------------------------------------------------------------- /tests/data/example.txt.cart: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CybercentreCanada/Maco/HEAD/tests/data/example.txt.cart -------------------------------------------------------------------------------- /tests/data/trigger_complex.txt: -------------------------------------------------------------------------------- 1 | file to trigger demo extractors 2 | 3 | self_trigger 4 | 5 | Complex 6 | Paradise 7 | -------------------------------------------------------------------------------- /tests/data/trigger_complex.txt.cart: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CybercentreCanada/Maco/HEAD/tests/data/trigger_complex.txt.cart -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 6.0 3 | addopts = -ra -q -k "not git and not extractors" 4 | testpaths = 5 | tests 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cart 2 | pydantic>=2.0.0 3 | tomli >= 1.1.0 ; python_version < "3.11" 4 | uv 5 | yara-x 6 | multiprocess>=0.70.17 7 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "charliermarsh.ruff", 5 | "elagil.pre-commit-helper" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.9 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | args: [--fix, --preview] 9 | # Run the formatter. 
10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /tests/extractors/bob/bob.py: -------------------------------------------------------------------------------- 1 | """Simple extractor for testing module and submodule with the same name.""" 2 | 3 | from maco import extractor 4 | 5 | 6 | class Bob(extractor.Extractor): 7 | """A simplistic script for testing.""" 8 | 9 | family = "bob" 10 | author = "bob" 11 | last_modified = "2022-06-14" 12 | -------------------------------------------------------------------------------- /demo_extractors/complex/complex_utils.py: -------------------------------------------------------------------------------- 1 | """Example of a complex function invoked by the extractor.""" 2 | 3 | from typing import Dict 4 | 5 | 6 | def getdata() -> Dict[str, int]: 7 | """This could be some complex and long function to support the main script. 8 | 9 | Returns: 10 | (Dict[str, int]): returns mock results 11 | """ 12 | return {"result": 5} 13 | -------------------------------------------------------------------------------- /demo_extractors/shared.py: -------------------------------------------------------------------------------- 1 | """Custom model based on Maco's model.""" 2 | 3 | from typing import Optional 4 | 5 | import pydantic 6 | 7 | from maco import model 8 | 9 | 10 | class MyCustomModel(model.ExtractorModel): 11 | """Custom model based on Maco's model.""" 12 | 13 | class Other(pydantic.BaseModel): 14 | """Custom 'other' class.""" 15 | 16 | key1: str 17 | key2: bool 18 | key3: int 19 | 20 | # set a custom class here as valid for the 'other' property 21 | other: Optional[Other] = None 22 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """CLI testing.""" 2 | 3 | import os 4 | import unittest 5 | 6 | from maco import cli 7 | 8 | 9 | class TestCLI(unittest.TestCase): 10 | """Test CLI.""" 11 | 12 | def test_process_filesystem(self): 13 | """Test process_filesystem.""" 14 | maco_path = os.path.abspath(os.path.join(__file__, "../../demo_extractors")) 15 | test_path = os.path.abspath(os.path.join(__file__, "../data")) 16 | results = cli.process_filesystem( 17 | maco_path, 18 | test_path, 19 | include=[], 20 | exclude=[], 21 | pretty=True, 22 | force=False, 23 | include_base64=False, 24 | ) 25 | self.assertEqual(results, (3, 3, 3)) 26 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": "charliermarsh.ruff" 4 | }, 5 | "editor.codeActionsOnSave": { 6 | "source.organizeImports": "explicit" 7 | }, 8 | "editor.formatOnSave": true, 9 | "editor.rulers": [ 10 | 120 11 | ], 12 | "editor.tabSize": 4, 13 | "editor.wordWrap": "wordWrapColumn", 14 | "editor.wordWrapColumn": 120, 15 | "files.insertFinalNewline": true, 16 | "files.trimFinalNewlines": true, 17 | "files.trimTrailingWhitespace": true, 18 | "pre-commit-helper.runOnSave": "all hooks", 19 | "python.testing.pytestArgs": [ 20 | "tests" 21 | ], 22 | "python.testing.pytestEnabled": true, 23 | "ruff.lint.enable": true, 24 | "ruff.configuration": "pyproject.toml", 25 | "ruff.lint.preview": true 26 | } 27 | -------------------------------------------------------------------------------- /tox.ini: 
-------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py38,py39,py310,py311,py312,parallelism,style 3 | [testenv] 4 | # install testing framework 5 | deps = 6 | pytest 7 | -r requirements.txt 8 | -r tests/requirements.txt 9 | # run the tests 10 | commands = python -m pytest tests/ -p no:cacheprovider --durations=10 -ra -q -k "not git and not extractors and not parallelism" -vv -W ignore::DeprecationWarning 11 | 12 | [testenv:style] 13 | # install testing framework 14 | deps = 15 | ruff 16 | # run the tests 17 | commands = 18 | ruff format --check 19 | ruff check 20 | 21 | [testenv:parallelism] 22 | # install parallel testing framework 23 | deps = 24 | pytest 25 | pytest-xdist 26 | -r requirements.txt 27 | -r tests/requirements.txt 28 | # run parallel tests 29 | commands = python -m pytest tests/test_parallelism.py -p no:cacheprovider -n 4 -vv -W ignore::DeprecationWarning 30 | -------------------------------------------------------------------------------- /maco/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exception classes for extractors.""" 2 | 3 | 4 | # Can be raised by extractors to abort analysis of a sample 5 | # ie. Can abort if preliminary checks at start of run indicate the file shouldn't be analyzed by extractor 6 | class AnalysisAbortedException(Exception): 7 | """Raised when extractors voluntarily abort analysis of a sample.""" 8 | 9 | pass 10 | 11 | 12 | class ExtractorLoadError(Exception): 13 | """Raised when extractors cannot be loaded.""" 14 | 15 | pass 16 | 17 | 18 | class InvalidExtractor(ValueError): 19 | """Raised when an extractor is invalid.""" 20 | 21 | pass 22 | 23 | 24 | class NoHitException(Exception): 25 | """Raised when the YARA rule of an extractor doesn't hit.""" 26 | 27 | pass 28 | 29 | 30 | class SyntaxError(Exception): 31 | """Raised when there's a syntax error in the YARA rule.""" 32 | 33 | pass 34 | -------------------------------------------------------------------------------- /tests/extractors/test_basic.py: -------------------------------------------------------------------------------- 1 | """Test basic extractors.""" 2 | 3 | import io 4 | import os 5 | 6 | from maco import base_test 7 | 8 | 9 | class TestBasicLonger(base_test.BaseTest): 10 | """Test that an extractor containing the name of another extractor works properly.""" 11 | 12 | name = "BasicLonger" 13 | path = os.path.join(__file__, "..") 14 | 15 | def test_run(self): 16 | """Test run.""" 17 | ret = self.extract(io.BytesIO(b"BasicLonger")) 18 | self.assertEqual(ret["family"], "basic_longer") 19 | 20 | 21 | class TestBasic(base_test.BaseTest): 22 | """Test that an extractor containing the name of another extractor works properly.""" 23 | 24 | name = "Basic" 25 | path = os.path.join(__file__, "..") 26 | 27 | def test_run(self): 28 | """Test run.""" 29 | ret = self.extract(io.BytesIO(b"Basic")) 30 | self.assertEqual(ret["family"], "basic") 31 | -------------------------------------------------------------------------------- /demo_extractors/nothing.py: -------------------------------------------------------------------------------- 1 | """Demo extractor that returns nothing.""" 2 | 3 | from io import BytesIO 4 | from typing import List 5 | 6 | from maco import extractor, yara 7 | 8 | 9 | class Nothing(extractor.Extractor): 10 | """Returns no extracted data.""" 11 | 12 | family = "nothing" 13 | author = "blue" 14 | last_modified = "2022-06-14" 15 | yara_rule = """ 16 | rule Nothing 17 | { 
18 | strings: 19 | $self_trigger = "Nothing" 20 | 21 | condition: 22 | $self_trigger 23 | } 24 | """ 25 | 26 | def run(self, stream: BytesIO, matches: List[yara.Match]): 27 | """Run the analysis process. 28 | 29 | Args: 30 | stream (BytesIO): file object from disk/network/memory. 31 | matches (List[yara.Match]): yara rule matches 32 | """ 33 | # return config model formatted results 34 | return 35 | -------------------------------------------------------------------------------- /demo_extractors/terminator.py: -------------------------------------------------------------------------------- 1 | """Example extractor that terminates early during extraction.""" 2 | 3 | from io import BytesIO 4 | from typing import List, Optional 5 | 6 | from maco import extractor, model, yara 7 | from maco.exceptions import AnalysisAbortedException 8 | 9 | 10 | class Terminator(extractor.Extractor): 11 | """Terminates early during extraction.""" 12 | 13 | family = "terminator" 14 | author = "skynet" 15 | last_modified = "1997-08-29" 16 | 17 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]: 18 | """Run the analysis process but terminate early. 19 | 20 | Args: 21 | stream (BytesIO): file object from disk/network/memory. 22 | matches (List[yara.Match]): yara rule matches 23 | 24 | Raises: 25 | AnalysisAbortedException: Extractor has decided to terminate early 26 | """ 27 | # Terminate early and indicate I can't run on this sample 28 | raise AnalysisAbortedException("I can't run on this sample") 29 | -------------------------------------------------------------------------------- /tests/extractors/basic.py: -------------------------------------------------------------------------------- 1 | """Basic extractor.""" 2 | 3 | from io import BytesIO 4 | from typing import List 5 | 6 | from maco import extractor, model, yara 7 | 8 | 9 | class Basic(extractor.Extractor): 10 | """A simplistic script for testing.""" 11 | 12 | family = "basic" 13 | author = "blue" 14 | last_modified = "2022-06-14" 15 | yara_rule = """ 16 | rule Basic 17 | { 18 | strings: 19 | $self_trigger = "Basic" 20 | 21 | condition: 22 | $self_trigger 23 | } 24 | """ 25 | 26 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> model.ExtractorModel: 27 | """Run the extractor. 28 | 29 | Returns: 30 | (model.ExtractorModel): Results from extractor 31 | 32 | """ 33 | # use a custom model that inherits from ExtractorModel 34 | # this model defines what can go in the 'other' dict 35 | tmp = model.ExtractorModel(family="basic") 36 | tmp.campaign_id.append("12345") 37 | tmp.other = dict(key1="key1", key2=True, key3=45) 38 | return tmp 39 | -------------------------------------------------------------------------------- /tests/extractors/basic_longer.py: -------------------------------------------------------------------------------- 1 | """Basic longer extractor.""" 2 | 3 | from io import BytesIO 4 | from typing import List 5 | 6 | from maco import extractor, model, yara 7 | 8 | 9 | class BasicLonger(extractor.Extractor): 10 | """A simplistic script for testing.""" 11 | 12 | family = "basic_longer" 13 | author = "blue" 14 | last_modified = "2022-06-14" 15 | yara_rule = """ 16 | rule BasicLonger 17 | { 18 | strings: 19 | $self_trigger = "BasicLonger" 20 | 21 | condition: 22 | $self_trigger 23 | } 24 | """ 25 | 26 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> model.ExtractorModel: 27 | """Run the extractor. 
28 | 29 | Returns: 30 | (model.ExtractorModel): Results from extractor 31 | """ 32 | # use a custom model that inherits from ExtractorModel 33 | # this model defines what can go in the 'other' dict 34 | tmp = model.ExtractorModel(family="basic_longer") 35 | tmp.campaign_id.append("12345") 36 | tmp.other = dict(key1="key1", key2=True, key3=45) 37 | return tmp 38 | -------------------------------------------------------------------------------- /demo_extractors/elfy.py: -------------------------------------------------------------------------------- 1 | """Demo extractor that targets ELF files.""" 2 | 3 | from io import BytesIO 4 | from typing import List, Optional 5 | 6 | from maco import extractor, model, yara 7 | 8 | 9 | class Elfy(extractor.Extractor): 10 | """Check basic elf property.""" 11 | 12 | family = "elfy" 13 | author = "blue" 14 | last_modified = "2022-06-14" 15 | yara_rule = """ 16 | import "elf" 17 | 18 | rule Elfy 19 | { 20 | condition: 21 | elf.number_of_sections > 50 22 | } 23 | """ 24 | 25 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]: 26 | """Run the analysis process. 27 | 28 | Args: 29 | stream (BytesIO): file object from disk/network/memory. 30 | matches (List[yara.Match]): yara rule matches 31 | 32 | Returns: 33 | (Optional[model.ExtractorModel]): model of results 34 | 35 | """ 36 | # return config model formatted results 37 | ret = model.ExtractorModel(family=self.family) 38 | # the list for campaign_id already exists and is empty, so we just add an item 39 | ret.campaign_id.append(str(len(stream.read()))) 40 | return ret 41 | -------------------------------------------------------------------------------- /model_setup/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "setuptools_scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "maco-model" 7 | dynamic = ["version"] 8 | description = "This package contains only the Pydantic model for Maco." 9 | dependencies = ["pydantic>=2.0.0"] 10 | requires-python = ">=3.8" 11 | authors = [{ name = "sl-govau" }] 12 | maintainers = [{ name = "cccs-rs" }] 13 | readme = "README.md" 14 | license = { file = "LICENSE.md" } 15 | 16 | classifiers = [ 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Developers", 19 | 20 | "Topic :: Software Development :: Libraries :: Python Modules", 21 | 22 | "License :: OSI Approved :: MIT License", 23 | 24 | "Programming Language :: Python :: 3.8", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | ] 30 | 31 | 32 | [project.urls] 33 | Repository = "https://github.com/CybercentreCanada/Maco" 34 | Issues = "https://github.com/CybercentreCanada/Maco/issues" 35 | 36 | [tool.setuptools_scm] 37 | root = ".." 38 | 39 | [tool.setuptools] 40 | packages = ["maco.model"] 41 | -------------------------------------------------------------------------------- /extractor_setup/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "setuptools_scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "maco-extractor" 7 | description = "This package contains the essentials for creating Maco extractors and using them at runtime." 
8 | dynamic = ["version"] 9 | dependencies = ["pydantic>=2.0.0", "yara-x"] 10 | requires-python = ">=3.8" 11 | authors = [{ name = "sl-govau" }] 12 | maintainers = [{ name = "cccs-rs" }] 13 | readme = "README.md" 14 | license = { file = "LICENSE.md" } 15 | 16 | classifiers = [ 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Developers", 19 | 20 | "Topic :: Software Development :: Libraries :: Python Modules", 21 | 22 | "License :: OSI Approved :: MIT License", 23 | 24 | "Programming Language :: Python :: 3.8", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | ] 30 | 31 | 32 | [project.urls] 33 | Repository = "https://github.com/CybercentreCanada/Maco" 34 | Issues = "https://github.com/CybercentreCanada/Maco/issues" 35 | 36 | [tool.setuptools_scm] 37 | root = ".." 38 | 39 | [tool.setuptools] 40 | packages = ["maco.model"] 41 | py-modules = ["maco.extractor", "maco.yara", "maco.exceptions"] 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Crown Copyright, Government of Canada (Canadian Centre for Cyber Security / Communications Security Establishment) and Government of Australia (Australian Cyber Security Centre / Australian Signals Directorate) 4 | 5 | Copyright title to all 3rd party software distributed with maco is held by the respective copyright holders as noted in those files. Users are asked to read the 3rd Party Licenses referenced with those assets. 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
12 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | """Extractor testing.""" 2 | 3 | import unittest 4 | 5 | from maco import extractor 6 | 7 | 8 | class TestExtractor(unittest.TestCase): 9 | """Test extractor.""" 10 | 11 | def test_bad(self): 12 | """Test bad extractor.""" 13 | 14 | class Tmp(extractor.Extractor): 15 | family = "smell_ya_later" 16 | author = "me" 17 | last_modified = "yeah" 18 | 19 | Tmp() 20 | 21 | class Tmp1(Tmp): 22 | family = None 23 | 24 | self.assertRaises(extractor.InvalidExtractor, Tmp1) 25 | 26 | class Tmp1(extractor.Extractor): 27 | author = None 28 | 29 | self.assertRaises(extractor.InvalidExtractor, Tmp1) 30 | 31 | class Tmp1(extractor.Extractor): 32 | version = None 33 | 34 | self.assertRaises(extractor.InvalidExtractor, Tmp1) 35 | 36 | class Tmp1(Tmp): 37 | yara_rule: str = "t" 38 | 39 | self.assertRaises(extractor.InvalidExtractor, Tmp1) 40 | 41 | class Tmp1(Tmp): 42 | yara_rule = """ 43 | rule DifferentName 44 | { 45 | condition: 46 | true 47 | } 48 | """ 49 | 50 | Tmp1() 51 | 52 | class Tmp1(Tmp): 53 | yara_rule = """ 54 | rule Tmp1 55 | { 56 | condition: 57 | true 58 | } 59 | rule OtherName 60 | { 61 | condition: 62 | true 63 | } 64 | """ 65 | 66 | Tmp1() 67 | -------------------------------------------------------------------------------- /pipelines/test.yaml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | trigger: ["*"] 4 | pr: ["*"] 5 | 6 | pool: 7 | vmImage: "ubuntu-22.04" 8 | 9 | jobs: 10 | - job: style_test 11 | strategy: 12 | matrix: 13 | Python3_12: 14 | python.version: "3.12" 15 | timeoutInMinutes: 10 16 | 17 | steps: 18 | - task: UsePythonVersion@0 19 | displayName: Set python version 20 | inputs: 21 | versionSpec: "$(python.version)" 22 | 23 | - script: | 24 | python -m pip install -U tox 25 | displayName: Install tox 26 | 27 | - script: | 28 | python -m tox -e style 29 | displayName: "Run style tests" 30 | 31 | - job: run_test 32 | strategy: 33 | matrix: 34 | Python3_8: 35 | python.version: "3.8" 36 | Python3_9: 37 | python.version: "3.9" 38 | Python3_10: 39 | python.version: "3.10" 40 | Python3_11: 41 | python.version: "3.11" 42 | Python3_12: 43 | python.version: "3.12" 44 | timeoutInMinutes: 10 45 | 46 | steps: 47 | - task: UsePythonVersion@0 48 | displayName: Set python version 49 | inputs: 50 | versionSpec: "$(python.version)" 51 | 52 | - script: | 53 | runtests=true 54 | if [ ! 
-d "$(pwd)/tests" ]; then 55 | echo "No tests found" 56 | runtest=false 57 | else 58 | python -m pip install -U tox 59 | fi 60 | echo "##vso[task.setvariable variable=runtests;]$runtests" 61 | displayName: Install tox 62 | 63 | - script: | 64 | python -m tox -e py 65 | displayName: "Run tests" 66 | condition: and(succeeded(), eq(variables.runtests, true)) 67 | -------------------------------------------------------------------------------- /demo_extractors/limit_other.py: -------------------------------------------------------------------------------- 1 | """Demo extractor to show the usage of the other field in the model.""" 2 | 3 | from io import BytesIO 4 | from typing import List, Optional 5 | 6 | from demo_extractors import shared 7 | from maco import extractor, model, yara 8 | 9 | 10 | class LimitOther(extractor.Extractor): 11 | """An example of how the 'other' dictionary can be limited in a custom way.""" 12 | 13 | family = "limit_other" 14 | author = "blue" 15 | last_modified = "2022-06-14" 16 | yara_rule = """ 17 | rule LimitOther 18 | { 19 | strings: 20 | $self_trigger = "LimitOther" 21 | 22 | condition: 23 | $self_trigger 24 | } 25 | """ 26 | 27 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]: 28 | """Run the analysis process. 29 | 30 | Args: 31 | stream (BytesIO): file object from disk/network/memory. 32 | matches (List[yara.Match]): yara rule matches 33 | 34 | Returns: 35 | (Optional[model.ExtractorModel]): model of results 36 | 37 | Raises: 38 | Exception: if the httpx library is not installed 39 | 40 | """ 41 | # import httpx at runtime so we can test that requirements.txt is installed dynamically without breaking 42 | # the tests that do direct importing 43 | import httpx 44 | 45 | # use httpx so it doesn't get deleted by auto linter 46 | if not httpx.__name__: 47 | raise Exception("wow I really want to use this library in a useful way") 48 | 49 | # use a custom model that inherits from ExtractorModel 50 | # this model defines what can go in the 'other' dict 51 | tmp = shared.MyCustomModel(family="specify_other") 52 | tmp.campaign_id.append("12345") 53 | tmp.other = tmp.Other(key1="key1", key2=True, key3=45) 54 | return tmp 55 | -------------------------------------------------------------------------------- /pipelines/publish.yaml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | trigger: 4 | branches: 5 | exclude: 6 | - '*' 7 | tags: 8 | include: ["v*"] 9 | pr: none 10 | 11 | pool: 12 | vmImage: "ubuntu-22.04" 13 | 14 | jobs: 15 | - job: test 16 | displayName: Test 17 | strategy: 18 | matrix: 19 | Python38: 20 | python.version: '3.8' 21 | Python39: 22 | python.version: '3.9' 23 | Python310: 24 | python.version: '3.10' 25 | Python311: 26 | python.version: '3.11' 27 | Python312: 28 | python.version: '3.12' 29 | steps: 30 | - task: UsePythonVersion@0 31 | displayName: 'Use Python $(python.version)' 32 | inputs: 33 | versionSpec: '$(python.version)' 34 | 35 | - script: | 36 | set -x 37 | 38 | python -m pip install -U tox 39 | python -m tox -e py 40 | 41 | - job: build_and_deploy 42 | dependsOn: test 43 | displayName: Build and Deploy 44 | variables: 45 | - group: deployment-information 46 | 47 | steps: 48 | - task: UsePythonVersion@0 49 | displayName: 'Use Python 3.9' 50 | inputs: 51 | versionSpec: '3.9' 52 | 53 | - script: | 54 | set -x 55 | python -m pip install -U build 56 | python -m build 57 | ls dist 58 | displayName: Build (Full) 59 | 60 | - script: | 61 | set -x 62 
| cd model_setup 63 | python -m build --outdir ../dist 64 | ls ../dist 65 | displayName: Build (Model Only) 66 | 67 | - script: | 68 | set -x 69 | cd extractor_setup 70 | python -m build --outdir ../dist 71 | ls ../dist 72 | displayName: Build (Extractor Essentials) 73 | 74 | - script: | 75 | set -xv # Echo commands before they are run 76 | sudo env "PATH=$PATH" python -m pip install --no-cache-dir twine 77 | ls dist 78 | twine upload --skip-existing dist/* 79 | displayName: Deploy to PyPI 80 | env: 81 | TWINE_USERNAME: $(twineUsername) 82 | TWINE_PASSWORD: $(twinePassword) 83 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "setuptools_scm>=8"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "maco" 7 | description = "Maco is a framework for creating and using malware configuration extractors." 8 | dynamic = ["version", "readme", "dependencies"] 9 | requires-python = ">=3.8" 10 | authors = [{ name = "sl-govau" }] 11 | maintainers = [{ name = "cccs-rs" }] 12 | license = { file = "LICENSE.md" } 13 | 14 | classifiers = [ 15 | "Development Status :: 5 - Production/Stable", 16 | "Intended Audience :: Developers", 17 | 18 | "Topic :: Software Development :: Libraries :: Python Modules", 19 | 20 | "License :: OSI Approved :: MIT License", 21 | 22 | "Programming Language :: Python :: 3.8", 23 | "Programming Language :: Python :: 3.9", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | ] 28 | 29 | [project.scripts] 30 | maco = "maco.cli:main" 31 | 32 | [project.urls] 33 | Repository = "https://github.com/CybercentreCanada/Maco" 34 | Issues = "https://github.com/CybercentreCanada/Maco/issues" 35 | 36 | [tool.setuptools_scm] 37 | 38 | [tool.setuptools.dynamic] 39 | readme = { file = ["README.md"], content-type = "text/markdown" } 40 | dependencies = { file = ["requirements.txt"] } 41 | 42 | [tool.setuptools.packages.find] 43 | where = ["."] 44 | exclude = ["test", "tests", "extractors", "model_setup", "extractor_setup"] 45 | 46 | [tool.ruff] 47 | line-length = 120 48 | 49 | [tool.ruff.format] 50 | docstring-code-format = true 51 | 52 | [tool.ruff.lint] 53 | # Add the `line-too-long` rule to the enforced rule set. By default, Ruff omits rules that 54 | # overlap with the use of a formatter, like Black, but we can override this behavior by 55 | # explicitly adding the rule. 
56 | extend-select = ["E501", "D", "DOC"] 57 | ignore = ["D104"] 58 | preview = true 59 | 60 | [tool.ruff.lint.pydocstyle] 61 | convention = "google" 62 | -------------------------------------------------------------------------------- /tests/test_demo_extractors.py: -------------------------------------------------------------------------------- 1 | """Test demo extractors.""" 2 | 3 | import os 4 | import unittest 5 | 6 | from maco import cli 7 | from maco.collector import Collector 8 | 9 | 10 | class TestDemoExtractors(unittest.TestCase): 11 | """Test demo extractors.""" 12 | 13 | def test_complex(self): 14 | """Test complex extractor.""" 15 | path_file = os.path.normpath(os.path.join(__file__, "../data/trigger_complex.txt")) 16 | collector = Collector(os.path.join(__file__, "../../demo_extractors")) 17 | self.assertEqual( 18 | set(collector.extractors.keys()), 19 | {"Elfy", "Nothing", "Complex", "LimitOther", "Terminator"}, 20 | ) 21 | 22 | with open(path_file, "rb") as stream: 23 | ret = cli.process_file( 24 | collector, 25 | path_file, 26 | stream, 27 | pretty=True, 28 | force=False, 29 | include_base64=False, 30 | ) 31 | self.assertEqual( 32 | ret, 33 | { 34 | "Complex": { 35 | "family": "complex", 36 | "version": "5", 37 | "decoded_strings": sorted(["Paradise", "Complex"]), 38 | "binaries": [ 39 | { 40 | "datatype": "payload", 41 | "encryption": {"algorithm": "something"}, 42 | "sha256": "1307990e6ba5ca145eb35e99182a9bec46531bc54ddf656a602c780fa0240dee", 43 | "size": 9, 44 | "hex_sample": "736F6D652064617461", 45 | } 46 | ], 47 | "http": [ 48 | { 49 | "protocol": "https", 50 | "hostname": "blarg5.com", 51 | "path": "/malz/64", 52 | "usage": "c2", 53 | } 54 | ], 55 | "encryption": [{"algorithm": "sha256"}], 56 | } 57 | }, 58 | ) 59 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | """Test helper functions.""" 2 | 3 | import io 4 | import os 5 | import unittest 6 | 7 | from maco import collector 8 | 9 | path_extractors = "../../demo_extractors" 10 | 11 | 12 | class TestHelpersFindExtractors(unittest.TestCase): 13 | """Test finding extractors.""" 14 | 15 | def test_find_extractors(self): 16 | """Test finding extractors.""" 17 | target = os.path.join(__file__, path_extractors) 18 | m = collector.Collector(target) 19 | # extractors = helpers.find_extractors(target) 20 | self.assertEqual(len(m.extractors), 5) 21 | self.assertEqual( 22 | {x for x in m.extractors.keys()}, 23 | {"Complex", "Elfy", "LimitOther", "Nothing", "Terminator"}, 24 | ) 25 | 26 | 27 | class TestHelpersCompileYara(unittest.TestCase): 28 | """Test YARA rule compilation.""" 29 | 30 | def test_compile_yara(self): 31 | """Test YARA rule compilation.""" 32 | target = os.path.join(__file__, path_extractors) 33 | m = collector.Collector(target) 34 | self.assertEqual( 35 | {x.identifier for x in m.rules}, 36 | {"Elfy", "Complex", "ComplexSubtext", "Nothing", "ComplexAlt", "LimitOther", "Terminator"}, 37 | ) 38 | 39 | 40 | class TestHelpersAnalyseStream(unittest.TestCase): 41 | """Test analyzing a stream.""" 42 | 43 | def setUp(self): 44 | """Setup.""" 45 | target = os.path.join(__file__, path_extractors) 46 | self.m = collector.Collector(target) 47 | 48 | def test_analyse_stream(self): 49 | """Test analyzing a stream.""" 50 | data = b"" 51 | resp = self.m.extract(io.BytesIO(data), "Complex") 52 | self.assertEqual(resp, None) 53 | 54 | data = b"data" 55 | resp = self.m.extract(io.BytesIO(data), 
"Complex") 56 | self.assertEqual( 57 | resp, 58 | { 59 | "family": "complex", 60 | "version": "5", 61 | "binaries": [ 62 | { 63 | "datatype": "payload", 64 | "data": b"some data", 65 | "encryption": {"algorithm": "something"}, 66 | } 67 | ], 68 | "http": [ 69 | { 70 | "protocol": "https", 71 | "hostname": "blarg5.com", 72 | "path": "/malz/4", 73 | "usage": "c2", 74 | } 75 | ], 76 | "encryption": [{"algorithm": "sha256"}], 77 | }, 78 | ) 79 | -------------------------------------------------------------------------------- /tests/test_parallelism.py: -------------------------------------------------------------------------------- 1 | """Test extractor loading and import rewriting when executed in parallel.""" 2 | 3 | import os 4 | 5 | from maco.collector import Collector 6 | import unittest 7 | 8 | 9 | class TestParallelism(unittest.TestCase): 10 | """Test parallel loading of maco extractors. 11 | 12 | This test only makes sense when run in parallel -- running a single instance will not test the affected areas. 13 | pytest-xdist needs to be installed to run these tests in parallel, use the -n flag to specify how many processes. 14 | 2 or 4 is a reasonable number for the four test cases here. 15 | python -m pytest tests/test_parallelism.py -n 2 16 | """ 17 | 18 | # determine path to test extractor 19 | working_dir = os.path.join(os.path.dirname(__file__), "extractors/import_rewriting") 20 | assert os.path.isdir(working_dir) 21 | 22 | # this value may need to be increased to ensure the errors occur, depending on your test system 23 | repetitions = 5 24 | 25 | def test_parallelism_1(self): 26 | """Test for one pytest-xdist worker.""" 27 | for _ in range(self.repetitions): 28 | collector = Collector(self.working_dir, create_venv=False) 29 | 30 | # if extractor isn't overwritten, extractor will load 31 | # otherwise this raises an ExtractorLoadError because the extractor file is empty 32 | self.assertListEqual(list(collector.extractors.keys()), ["Importer"]) 33 | 34 | def test_parallelism_2(self): 35 | """Test for one pytest-xdist worker.""" 36 | for _ in range(self.repetitions): 37 | collector = Collector(self.working_dir, create_venv=False) 38 | 39 | # if extractor isn't overwritten, extractor will load 40 | # otherwise this raises an ExtractorLoadError because the extractor file is empty 41 | self.assertListEqual(list(collector.extractors.keys()), ["Importer"]) 42 | 43 | def test_parallelism_3(self): 44 | """Test for one pytest-xdist worker.""" 45 | for _ in range(self.repetitions): 46 | collector = Collector(self.working_dir, create_venv=False) 47 | 48 | # if extractor isn't overwritten, extractor will load 49 | # otherwise this raises an ExtractorLoadError because the extractor file is empty 50 | self.assertListEqual(list(collector.extractors.keys()), ["Importer"]) 51 | 52 | def test_parallelism_4(self): 53 | """Test for one pytest-xdist worker.""" 54 | for _ in range(self.repetitions): 55 | collector = Collector(self.working_dir, create_venv=False) 56 | 57 | # if extractor isn't overwritten, extractor will load 58 | # otherwise this raises an ExtractorLoadError because the extractor file is empty 59 | self.assertListEqual(list(collector.extractors.keys()), ["Importer"]) 60 | -------------------------------------------------------------------------------- /demo_extractors/complex/complex.py: -------------------------------------------------------------------------------- 1 | """Demo complex extractor.""" 2 | 3 | from io import BytesIO 4 | from typing import List, Optional 5 | 6 | from 
demo_extractors.complex import complex_utils 7 | from maco import extractor, model, yara 8 | 9 | 10 | class Complex(extractor.Extractor): 11 | """This script has multiple yara rules and coverage of the data model.""" 12 | 13 | family = "complex" 14 | author = "blue" 15 | last_modified = "2022-06-14" 16 | yara_rule = """ 17 | private rule ComplexSubtext 18 | { 19 | strings: 20 | $self_trigger = "self_trigger" 21 | condition: 22 | $self_trigger 23 | } 24 | rule Complex 25 | { 26 | strings: 27 | $self_trigger = "Complex" 28 | $my_hex_string = { E2 34 A1 C8 23 FB } 29 | 30 | condition: 31 | ($self_trigger or $my_hex_string) and ComplexSubtext 32 | } 33 | rule ComplexAlt 34 | { 35 | strings: 36 | $self_trigger = "Paradise" 37 | 38 | condition: 39 | $self_trigger 40 | } 41 | """ 42 | 43 | def run(self, stream: BytesIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]: 44 | """Run the analysis process. 45 | 46 | Args: 47 | stream (BytesIO): file object from disk/network/memory. 48 | matches (List[yara.Match]): yara rule matches 49 | 50 | Returns: 51 | (Optional[model.ExtractorModel]): model of results 52 | 53 | """ 54 | self.logger.info("starting run") 55 | self.logger.debug(f"{[x.rule for x in matches]=}") 56 | data = stream.read() 57 | if not data: 58 | return 59 | # this is where you would do some processing on the file 60 | data_len = len(data) 61 | other = complex_utils.getdata()["result"] 62 | self.logger.debug("got data from lib") 63 | # example - accessing yara strings 64 | strings = sorted({z.plaintext().decode("utf8") for x in matches for y in x.strings for z in y.instances}) 65 | self.logger.debug(f"{strings=}") 66 | # construct model of results 67 | tmp = model.ExtractorModel(family=self.family) 68 | tmp.decoded_strings = strings 69 | tmp.version = "5" 70 | tmp.http.append( 71 | tmp.Http( 72 | protocol="https", 73 | hostname=f"blarg{other}.com", 74 | path=f"/malz/{data_len}", 75 | usage="c2", 76 | ) 77 | ) 78 | 79 | tmp.encryption.append(tmp.Encryption(algorithm="sha256")) 80 | tmp.binaries.append( 81 | tmp.Binary( 82 | data=b"some data", 83 | datatype=tmp.Binary.TypeEnum.payload, 84 | encryption=tmp.Binary.Encryption(algorithm="something"), 85 | ) 86 | ) 87 | return tmp 88 | -------------------------------------------------------------------------------- /maco/extractor.py: -------------------------------------------------------------------------------- 1 | """Base class for an extractor script.""" 2 | 3 | import logging 4 | import textwrap 5 | from typing import BinaryIO, List, Optional, Union 6 | 7 | from maco import model, yara 8 | from maco.exceptions import InvalidExtractor 9 | 10 | DEFAULT_YARA_RULE = """ 11 | rule {name} 12 | {{ 13 | condition: 14 | true 15 | }} 16 | """ 17 | 18 | 19 | class Extractor: 20 | """Base class for an analysis extractor with common entrypoint and metadata. 21 | 22 | Override this docstring with a good description of your extractor. 23 | """ 24 | 25 | family: Union[str, List[str]] = None # family or families of malware that is detected by the extractor 26 | author: str = None # author of the extractor (name@organisation) 27 | last_modified: str = None # last modified date (YYYY-MM-DD) 28 | sharing: str = "TLP:WHITE" # who can this be shared with? 29 | yara_rule: str = None # yara rule that we filter inputs with 30 | reference: str = None # link to malware report or other reference information 31 | logger: logging.Logger = None # logger for use when debugging 32 | 33 | def __init__(self) -> None: 34 | """Initialise the extractor. 
35 | 36 | Raises: 37 | InvalidExtractor: When the extractor is invalid. 38 | """ 39 | self.name = name = type(self).__name__ 40 | self.logger = logging.getLogger(f"maco.extractor.{name}") 41 | self.logger.debug(f"initialise '{name}'") 42 | if not self.family or not self.author or not self.last_modified: 43 | raise InvalidExtractor("must set family, author, last_modified") 44 | # if author does not set a yara rule, match on everything 45 | if not self.yara_rule: 46 | self.yara_rule = DEFAULT_YARA_RULE.format(name=name) 47 | # unindent the yara rule from triple quoted string 48 | # this is for friendly printing, yara handles the rule ok either way 49 | self.yara_rule = textwrap.dedent(self.yara_rule) 50 | # check yara rules conform to expected structure 51 | # we throw away these compiled rules as we need all rules in system compiled together 52 | try: 53 | self.yara_compiled = yara.compile(source=self.yara_rule) 54 | except yara.SyntaxError as e: 55 | raise InvalidExtractor(f"{self.name} - invalid yara rule") from e 56 | # need to track which plugin owns the rules 57 | self.yara_rule_names = [x.identifier for x in self.yara_compiled] 58 | if not len(list(self.yara_compiled)): 59 | raise InvalidExtractor(f"{name} must define at least one yara rule") 60 | for x in self.yara_compiled: 61 | if x.is_global: 62 | raise InvalidExtractor(f"{x.identifier} yara rule must not be global") 63 | 64 | def run(self, stream: BinaryIO, matches: List[yara.Match]) -> Optional[model.ExtractorModel]: 65 | """Run the analysis process and return dict matching. 66 | 67 | :param stream: file object from disk/network/memory. 68 | :param match: yara rule match information contains locations of strings. 69 | """ 70 | raise NotImplementedError() 71 | -------------------------------------------------------------------------------- /maco/base_test.py: -------------------------------------------------------------------------------- 1 | """Foundation for unit testing an extractor. 2 | 3 | Example: 4 | from maco import base_test 5 | class TestExample(base_test.BaseTest): 6 | name = "Example" 7 | path = os.path.join(__file__, "../../extractors") 8 | def test_run(self): 9 | data = b"data with Example information" 10 | ret = self.extract(io.BytesIO(data)) 11 | self.assertEqual(ret["family"], "example") 12 | """ 13 | 14 | import importlib 15 | import io 16 | import os 17 | import unittest 18 | 19 | import cart 20 | 21 | from maco import collector 22 | from maco.exceptions import NoHitException 23 | 24 | 25 | class BaseTest(unittest.TestCase): 26 | """Base test class.""" 27 | 28 | name: str = None # name of the extractor 29 | # folder and/or file where extractor is. 30 | # I recommend something like os.path.join(__file__, "../../extractors") 31 | # if your extractors are in a folder 'extractors' next to a folder of tests 32 | path: str = None 33 | create_venv: bool = False 34 | 35 | @classmethod 36 | def setUpClass(cls) -> None: 37 | """Initialization of class. 38 | 39 | Raises: 40 | Exception: when name or path is not set. 
41 | """ 42 | if not cls.name or not cls.path: 43 | raise Exception("name and path must be set") 44 | cls.c = collector.Collector(cls.path, include=[cls.name], create_venv=cls.create_venv) 45 | return super().setUpClass() 46 | 47 | def test_default_metadata(self): 48 | """Require extractor to be loadable and valid.""" 49 | self.assertIn(self.name, self.c.extractors) 50 | self.assertEqual(len(self.c.extractors), 1) 51 | 52 | def extract(self, stream): 53 | """Return results for running extractor over stream, including yara check. 54 | 55 | Raises: 56 | NoHitException: when yara rule doesn't hit. 57 | """ 58 | runs = self.c.match(stream) 59 | if not runs: 60 | raise NoHitException("no yara rule hit") 61 | resp = self.c.extract(stream, self.name) 62 | return resp 63 | 64 | @classmethod 65 | def _get_location(cls) -> str: 66 | """Return path to child class that implements this class.""" 67 | # import child module 68 | module = cls.__module__ 69 | i = importlib.import_module(module) 70 | # get location to child module 71 | return i.__file__ 72 | 73 | @classmethod 74 | def load_cart(cls, filepath: str) -> io.BytesIO: 75 | """Load and unneuter a test file (likely malware) into memory for processing. 76 | 77 | Args: 78 | filepath (str): Path to carted sample 79 | 80 | Returns: 81 | (io.BytesIO): Buffered stream containing the un-carted sample 82 | 83 | Raises: 84 | FileNotFoundError: if the path to the sample doesn't exist 85 | """ 86 | # it is nice if we can load files relative to whatever is implementing base_test 87 | dirpath = os.path.split(cls._get_location())[0] 88 | # either filepath is absolute, or should be loaded relative to child of base_test 89 | filepath = os.path.join(dirpath, filepath) 90 | if not os.path.isfile(filepath): 91 | raise FileNotFoundError(filepath) 92 | with open(filepath, "rb") as f: 93 | unpacked = io.BytesIO() 94 | # just bubble exceptions if it isn't cart 95 | cart.unpack_stream(f, unpacked) 96 | # seek to start of the unneutered stream 97 | unpacked.seek(0) 98 | return unpacked 99 | -------------------------------------------------------------------------------- /tests/test_base_test.py: -------------------------------------------------------------------------------- 1 | """Base testing.""" 2 | 3 | import io 4 | import os 5 | 6 | from demo_extractors.complex import complex, complex_utils 7 | from maco import base_test 8 | 9 | 10 | class TestLimitOther(base_test.BaseTest): 11 | """Test that limit_other extractor can be used in base environment.""" 12 | 13 | name = "LimitOther" 14 | path = os.path.join(__file__, "../../demo_extractors") 15 | 16 | def test_load_cart(self): 17 | """Test loading a cart file.""" 18 | data = self.load_cart("data/example.txt.cart").read() 19 | self.assertEqual(data, b"LimitOther\n") 20 | 21 | def test_extract(self): 22 | """Tests that we can run an extractor through maco.""" 23 | ret = self.extract(self.load_cart("data/example.txt.cart")) 24 | self.assertEqual(ret["family"], "specify_other") 25 | self.assertEqual(ret["campaign_id"], ["12345"]) 26 | 27 | 28 | class TestComplex(base_test.BaseTest): 29 | """Test that complex extractor can be used in base environment.""" 30 | 31 | name = "Complex" 32 | path = os.path.join(__file__, "../../demo_extractors") 33 | create_venv = False 34 | 35 | def test_extract(self): 36 | """Tests that we can run an extractor through maco.""" 37 | ret = self.extract(self.load_cart("data/trigger_complex.txt.cart")) 38 | self.assertEqual(ret["family"], "complex") 39 | self.assertEqual(ret["version"], "5") 40 | 41 | 
def test_subfunction(self): 42 | """Tests that we can import directly from the extractor module and run a function.""" 43 | self.assertEqual(complex_utils.getdata(), {"result": 5}) 44 | 45 | def test_manual_extract(self): 46 | """Tests that we can run an extractor through maco.""" 47 | ref = complex.Complex 48 | self.assertGreater(len(ref.yara_rule), 100) 49 | instance = complex.Complex() 50 | self.assertGreater(len(instance.yara_rule), 100) 51 | 52 | data = io.BytesIO(b"my malwarez") 53 | result = instance.run(data, []) 54 | self.assertEqual(result.family, "complex") 55 | 56 | 57 | class TestComplexVenv(base_test.BaseTest): 58 | """Test that complex extractor can be used in full venv isolation.""" 59 | 60 | name = "Complex" 61 | path = os.path.join(__file__, "../../demo_extractors") 62 | create_venv = True 63 | 64 | def test_extract(self): 65 | """Tests that we can run an extractor through maco.""" 66 | ret = self.extract(self.load_cart("data/trigger_complex.txt.cart")) 67 | self.assertEqual(ret["family"], "complex") 68 | self.assertEqual(ret["version"], "5") 69 | 70 | 71 | class TestTerminator(base_test.BaseTest): 72 | """Test that terminator extractor can be used in base environment.""" 73 | 74 | name = "Terminator" 75 | path = os.path.join(__file__, "../../demo_extractors") 76 | create_venv = False 77 | 78 | def test_extract(self): 79 | """Tests that we can run an extractor through maco.""" 80 | ret = self.extract(self.load_cart("data/trigger_complex.txt.cart")) 81 | self.assertEqual(ret, None) 82 | 83 | 84 | class TestTerminatorVenv(base_test.BaseTest): 85 | """Test that terminator extractor can be used in base environment.""" 86 | 87 | name = "Terminator" 88 | path = os.path.join(__file__, "../../demo_extractors") 89 | create_venv = True 90 | 91 | def test_extract(self): 92 | """Tests that we can run an extractor through maco.""" 93 | ret = self.extract(self.load_cart("data/trigger_complex.txt.cart")) 94 | self.assertEqual(ret, None) 95 | -------------------------------------------------------------------------------- /tests/test_detection.py: -------------------------------------------------------------------------------- 1 | """Test detection of extractors.""" 2 | 3 | import os 4 | import sys 5 | 6 | import pytest 7 | 8 | from maco.collector import Collector 9 | 10 | INIT_MODULES = list(sys.modules.keys()) 11 | TESTS_DIR = os.path.dirname(__file__) 12 | 13 | CAPE_EXTRACTORS = [ 14 | "AgentTesla", 15 | "AsyncRAT", 16 | "AuroraStealer", 17 | "Azorult", 18 | "BitPaymer", 19 | "BlackDropper", 20 | "Blister", 21 | "BruteRatel", 22 | "BumbleBee", 23 | "Carbanak", 24 | "CobaltStrikeBeacon", 25 | "CobaltStrikeStager", 26 | "DCRat", 27 | "DarkGate", 28 | "DoppelPaymer", 29 | "DridexLoader", 30 | "Fareit", 31 | "Formbook", 32 | "GuLoader", 33 | "IcedID", 34 | "IcedIDLoader", 35 | "KoiLoader", 36 | "Latrodectus", 37 | "LokiBot", 38 | "Lumma", 39 | "NanoCore", 40 | "Nighthawk", 41 | "Njrat", 42 | "Oyster", 43 | "PhemedroneStealer", 44 | "PikaBot", 45 | "PlugX", 46 | "QakBot", 47 | "QuasarRAT", 48 | "Quickbind", 49 | "RedLine", 50 | "Remcos", 51 | "Rhadamanthys", 52 | "SmokeLoader", 53 | "Socks5Systemz", 54 | "SparkRAT", 55 | "SquirrelWaffle", 56 | "Stealc", 57 | "Strrat", 58 | "VenomRAT", 59 | "WarzoneRAT", 60 | "XWorm", 61 | "XenoRAT", 62 | "Zloader", 63 | ] 64 | 65 | 66 | @pytest.mark.parametrize( 67 | "repository_url, extractors, python_minor, branch", 68 | [ 69 | ("https://github.com/jeFF0Falltrades/rat_king_parser", ["RKPMACO"], 10, None), 70 | ("https://github.com/CAPESandbox/community", 
CAPE_EXTRACTORS, 10, None), 71 | ], 72 | ids=("jeFF0Falltrades/rat_king_parser", "CAPESandbox/community"), 73 | ) 74 | def test_public_projects(repository_url: str, extractors: list, python_minor: int, branch: str): 75 | """Test compatibility with public projects.""" 76 | # Ensure that any changes we make doesn't break usage of public projects 77 | # which can affect downstream systems using like library (ie. Assemblyline) 78 | import sys 79 | from tempfile import TemporaryDirectory 80 | 81 | from git import Repo 82 | 83 | if sys.version_info >= (3, python_minor): 84 | with TemporaryDirectory() as working_dir: 85 | project_name = repository_url.rsplit("/", 1)[1] 86 | extractor_dir = os.path.join(working_dir, project_name) 87 | Repo.clone_from(repository_url, extractor_dir, depth=1, branch=branch) 88 | 89 | collector = Collector(extractor_dir, create_venv=True) 90 | assert set(extractors) == set(collector.extractors.keys()) 91 | 92 | else: 93 | pytest.skip("Unsupported Python version") 94 | 95 | 96 | def test_module_confusion(): 97 | """Test module confusion.""" 98 | import shutil 99 | from tempfile import TemporaryDirectory 100 | 101 | import git 102 | 103 | # ensure that the git import is kept 104 | assert git.__name__ 105 | 106 | # Directories that have the same name as the Python module, shouldn't cause confusion on loading the right module 107 | collector = Collector(os.path.join(__file__, "../extractors/bob")) 108 | assert collector.extractors["Bob"] 109 | 110 | collector = Collector(os.path.join(__file__, "../extractors")) 111 | assert collector.extractors["Bob"] 112 | 113 | # Existing packages shouldn't interfere with loading extractors from directories with similar names 114 | with TemporaryDirectory() as ex_copy: 115 | copy_ex_dir = f"{ex_copy}/git" 116 | shutil.copytree(f"{TESTS_DIR}/extractors", copy_ex_dir, dirs_exist_ok=True) 117 | collector = Collector(copy_ex_dir) 118 | assert collector.extractors["Bob"] and os.path.exists(collector.extractors["Bob"]["module_path"]) 119 | -------------------------------------------------------------------------------- /tests/benchmark.py: -------------------------------------------------------------------------------- 1 | """Benchmarking tests.""" 2 | 3 | import os 4 | import timeit 5 | 6 | from demo_extractors.complex import complex 7 | from maco import base_test 8 | 9 | # instance of extractor for synthetic comparison to maco 10 | instance = complex.Complex() 11 | 12 | 13 | class LocalBaseTest(base_test.BaseTest): 14 | """Local base test.""" 15 | 16 | name = "Complex" 17 | path = os.path.join(__file__, "../../demo_extractors") 18 | create_venv = False 19 | 20 | @classmethod 21 | def setUpClass(cls) -> None: 22 | """Setup class.""" 23 | super().setUpClass() 24 | cls.input_file = cls.load_cart("data/trigger_complex.txt.cart") 25 | cls.input_file.seek(0) 26 | 27 | 28 | class TestComplexSynthetic(LocalBaseTest): 29 | """Test extractors work bypassing maco.""" 30 | 31 | def test_extract(self): 32 | """Test extraction.""" 33 | self.input_file.seek(0) 34 | raw = self.input_file.read() 35 | self.input_file.seek(0) 36 | # run yara rules against sample 37 | matches = instance.yara_compiled.match(data=raw) 38 | self.assertEqual(len(matches), 2) 39 | result = instance.run(self.input_file, []) 40 | self.assertEqual(result.family, "complex") 41 | 42 | 43 | class TestComplexNoVenv(LocalBaseTest): 44 | """Test extractors work without full venv isolation.""" 45 | 46 | def test_extract(self): 47 | """Test extraction without a virtual environment.""" 48 | 
self.input_file.seek(0) 49 | ret = self.extract(self.input_file) 50 | self.assertEqual(ret["family"], "complex") 51 | self.assertEqual(ret["version"], "5") 52 | 53 | 54 | class TestComplexVenv(LocalBaseTest): 55 | """Test extractors work when run with virtual environments.""" 56 | 57 | create_venv = True 58 | 59 | def test_extract(self): 60 | """Test extraction with a virtual environment.""" 61 | self.input_file.seek(0) 62 | ret = self.extract(self.input_file) 63 | self.assertEqual(ret["family"], "complex") 64 | self.assertEqual(ret["version"], "5") 65 | 66 | 67 | def make_synthetic(): 68 | """Make synthetic test. 69 | 70 | Returns: 71 | SyntheticTest 72 | """ 73 | TestComplexSynthetic.setUpClass() 74 | tc = TestComplexSynthetic() 75 | tc.setUp() 76 | return tc 77 | 78 | 79 | def make_no_venv(): 80 | """Make no venv test. 81 | 82 | Returns: 83 | Test without virtual environment isolation 84 | """ 85 | TestComplexNoVenv.setUpClass() 86 | tc = TestComplexNoVenv() 87 | tc.setUp() 88 | return tc 89 | 90 | 91 | def make_venv(): 92 | """Make venv test. 93 | 94 | Returns: 95 | Test with virtual environment isolation 96 | """ 97 | TestComplexVenv.setUpClass() 98 | tc = TestComplexVenv() 99 | tc.setUp() 100 | return tc 101 | 102 | 103 | if __name__ == "__main__": 104 | trials = 1000 105 | print(f"num trials: {trials}") 106 | print("results are number of seconds to execute total number of trials") 107 | print("synthetic comparison (directly import and execute extractor)") 108 | print( 109 | timeit.timeit( 110 | "tc.test_extract()", 111 | setup="from __main__ import make_synthetic; tc=make_synthetic()", 112 | number=trials, 113 | ) 114 | ) 115 | print("maco no venv isolation") 116 | print( 117 | timeit.timeit( 118 | "tc.test_extract()", 119 | setup="from __main__ import make_no_venv; tc=make_no_venv()", 120 | number=trials, 121 | ) 122 | ) 123 | print("maco venv isolation") 124 | print( 125 | timeit.timeit( 126 | "tc.test_extract()", 127 | setup="from __main__ import make_venv; tc=make_venv()", 128 | number=trials, 129 | ) 130 | ) 131 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | ### VisualStudioCode ### 163 | .vscode/* 164 | !.vscode/settings.json 165 | !.vscode/tasks.json 166 | !.vscode/launch.json 167 | !.vscode/extensions.json 168 | !.vscode/*.code-snippets 169 | 170 | # Local History for Visual Studio Code 171 | .history/ 172 | 173 | # Built Visual Studio Code Extensions 174 | *.vsix 175 | 176 | ### VisualStudioCode Patch ### 177 | # Ignore all local history of files 178 | .history 179 | .ionide 180 | -------------------------------------------------------------------------------- /maco/yara.py: -------------------------------------------------------------------------------- 1 | """yara-python facade that uses yara-x.""" 2 | 3 | import re 4 | from collections import namedtuple 5 | from itertools import cycle 6 | from typing import Dict, List, Union 7 | 8 | import yara_x 9 | 10 | from maco.exceptions import SyntaxError 11 | 12 | RULE_ID_RE = re.compile("(\w+)? ?rule (\w+)") 13 | 14 | 15 | # Create interfaces that resembles yara-python (but is running yara-x under the hood) 16 | class StringMatchInstance: 17 | """Instance of a string match.""" 18 | 19 | def __init__(self, match: yara_x.Match, file_content: bytes): 20 | """Initializes StringMatchInstance.""" 21 | self.matched_data = file_content[match.offset : match.offset + match.length] 22 | self.matched_length = match.length 23 | self.offset = match.offset 24 | self.xor_key = match.xor_key 25 | 26 | def plaintext(self) -> bytes: 27 | """Plaintext of the matched data. 28 | 29 | Returns: 30 | (bytes): Plaintext of the matched cipher text 31 | """ 32 | if not self.xor_key: 33 | # No need to XOR the matched data 34 | return self.matched_data 35 | else: 36 | return bytes(c ^ k for c, k in zip(self.matched_data, cycle(self.xor_key))) 37 | 38 | 39 | class StringMatch: 40 | """String match.""" 41 | 42 | def __init__(self, pattern: yara_x.Pattern, file_content: bytes): 43 | """Initializes StringMatch.""" 44 | self.identifier = pattern.identifier 45 | self.instances = [StringMatchInstance(match, file_content) for match in pattern.matches] 46 | self._is_xor = any([match.xor_key for match in pattern.matches]) 47 | 48 | def is_xor(self): 49 | """Checks if string match is xor'd. 50 | 51 | Returns: 52 | (bool): True if match is xor'd 53 | """ 54 | return self._is_xor 55 | 56 | 57 | class Match: 58 | """Match.""" 59 | 60 | def __init__(self, rule: yara_x.Rule, file_content: bytes): 61 | """Initializes Match.""" 62 | self.rule = rule.identifier 63 | self.namespace = rule.namespace 64 | self.tags = list(rule.tags) or [] 65 | self.meta = dict() 66 | # Ensure metadata doesn't get overwritten 67 | for k, v in rule.metadata: 68 | self.meta.setdefault(k, []).append(v) 69 | self.strings = [StringMatch(pattern, file_content) for pattern in rule.patterns] 70 | 71 | 72 | class Rules: 73 | """Rules.""" 74 | 75 | def __init__(self, source: str = None, sources: Dict[str, str] = None): 76 | """Initializes Rules. 77 | 78 | Raises: 79 | SyntaxError: Raised when there's a syntax error in the YARA rule. 
80 | """ 81 | Rule = namedtuple("Rule", "identifier namespace is_global") 82 | if source: 83 | sources = {"default": source} 84 | 85 | try: 86 | self._rules = [] 87 | compiler = yara_x.Compiler(relaxed_re_syntax=True) 88 | for namespace, source in sources.items(): 89 | compiler.new_namespace(namespace) 90 | for rule_type, id in RULE_ID_RE.findall(source): 91 | is_global = True if rule_type == "global" else False 92 | self._rules.append(Rule(namespace=namespace, identifier=id, is_global=is_global)) 93 | compiler.add_source(source) 94 | self.scanner = yara_x.Scanner(compiler.build()) 95 | except yara_x.CompileError as e: 96 | raise SyntaxError(e) 97 | 98 | def __iter__(self): 99 | """Iterate over rules. 100 | 101 | Yields: 102 | YARA rules 103 | """ 104 | for rule in self._rules: 105 | yield rule 106 | 107 | def match(self, filepath: str = None, data: Union[bytes, bytearray] = None) -> List[Match]: 108 | """Performs a scan to check for YARA rules matches based on the file, either given by path or buffer. 109 | 110 | Returns: 111 | (List[Match]): A list of YARA matches. 112 | """ 113 | if filepath: 114 | with open(filepath, "rb") as fp: 115 | data = fp.read() 116 | 117 | if isinstance(data, bytearray): 118 | data = bytes(data) 119 | 120 | return [Match(m, data) for m in self.scanner.scan(data).matching_rules] 121 | 122 | 123 | def compile(source: str = None, sources: Dict[str, str] = None) -> Rules: 124 | """Compiles YARA rules from source or from sources. 125 | 126 | Returns: 127 | (Rules): a Rules object 128 | """ 129 | return Rules(source, sources) 130 | -------------------------------------------------------------------------------- /maco/collector.py: -------------------------------------------------------------------------------- 1 | """Convenience functions for discovering your extractors.""" 2 | 3 | import inspect 4 | import logging 5 | import logging.handlers 6 | import os 7 | import sys 8 | from tempfile import NamedTemporaryFile 9 | from types import ModuleType 10 | from typing import Any, BinaryIO, Dict, List, TypedDict, Union 11 | 12 | from multiprocess import Manager, Process, Queue 13 | from pydantic import BaseModel 14 | 15 | from maco import extractor, model, utils, yara 16 | from maco.exceptions import AnalysisAbortedException, ExtractorLoadError 17 | 18 | logger = logging.getLogger("maco.lib.helpers") 19 | 20 | 21 | def _verify_response(resp: Union[BaseModel, dict]) -> Dict: 22 | """Enforce types and verify properties, and remove defaults. 
23 | 24 | Args: 25 | resp (Union[BaseModel, dict])): results from extractor 26 | 27 | Returns: 28 | (Dict): results from extractor after verification 29 | """ 30 | if not resp: 31 | return None 32 | # check the response is valid for its own model 33 | # this is useful if a restriction on the 'other' dictionary is needed 34 | resp_model = type(resp) 35 | if resp_model != model.ExtractorModel and hasattr(resp_model, "model_validate"): 36 | resp = resp_model.model_validate(resp) 37 | # check the response is valid according to the ExtractorModel 38 | resp = model.ExtractorModel.model_validate(resp) 39 | # coerce sets to correct types 40 | # otherwise we end up with sets where we expect lists 41 | resp = model.ExtractorModel(**resp.model_dump()) 42 | # dump model to dict 43 | return resp.model_dump(exclude_defaults=True) 44 | 45 | 46 | class ExtractorMetadata(TypedDict): 47 | """Extractor-supplied metadata.""" 48 | 49 | author: str 50 | family: str 51 | last_modified: str 52 | sharing: str 53 | description: str 54 | 55 | 56 | class ExtractorRegistration(TypedDict): 57 | """Registration collected by the collector for a single extractor.""" 58 | 59 | venv: str 60 | module_path: str 61 | module_name: str 62 | extractor_class: str 63 | metadata: ExtractorMetadata 64 | 65 | 66 | class Collector: 67 | """Discover and load extractors from file system.""" 68 | 69 | def __init__( 70 | self, 71 | path_extractors: str, 72 | include: List[str] = None, 73 | exclude: List[str] = None, 74 | create_venv: bool = False, 75 | skip_install: bool = False, 76 | ): 77 | """Discover and load extractors from file system. 78 | 79 | Raises: 80 | ExtractorLoadError: when no extractors are found 81 | """ 82 | # maco requires the extractor to be imported directly, so ensure they are available on the path 83 | full_path_extractors = os.path.abspath(path_extractors) 84 | full_path_above_extractors = os.path.dirname(full_path_extractors) 85 | # Modify the PATH so we can recognize this new package on import 86 | if full_path_extractors not in sys.path: 87 | sys.path.insert(1, full_path_extractors) 88 | if full_path_above_extractors not in sys.path: 89 | sys.path.insert(1, full_path_above_extractors) 90 | 91 | path_extractors = os.path.realpath(path_extractors) 92 | self.path: str = path_extractors 93 | self.extractors: Dict[str, ExtractorRegistration] = {} 94 | 95 | with Manager() as manager: 96 | extractors = manager.dict() 97 | namespaced_rules = manager.dict() 98 | 99 | def extractor_module_callback(module: ModuleType, venv: str): 100 | members = inspect.getmembers(module, predicate=utils.maco_extractor_validation) 101 | for member in members: 102 | name, member = member 103 | if exclude and name in exclude: 104 | # Module is part of the exclusion list, skip 105 | logger.debug(f"exclude excluded '{name}'") 106 | return 107 | 108 | if include and name not in include: 109 | # Module wasn't part of the inclusion list, skip 110 | logger.debug(f"include excluded '{name}'") 111 | return 112 | 113 | # initialise and register 114 | logger.debug(f"register '{name}'") 115 | extractors[name] = dict( 116 | venv=venv, 117 | module_path=module.__file__, 118 | module_name=member.__module__, 119 | extractor_class=member.__name__, 120 | metadata={ 121 | "family": member.family, 122 | "author": member.author, 123 | "last_modified": member.last_modified, 124 | "sharing": member.sharing, 125 | "description": member.__doc__, 126 | }, 127 | ) 128 | namespaced_rules[name] = member.yara_rule or extractor.DEFAULT_YARA_RULE.format(name=name) 129 | 
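            # NOTE: `extractors` and `namespaced_rules` are Manager proxies, so this callback
            # (executed inside the child process spawned below) can report its registrations
            # back to the parent process.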
130 | # multiprocess logging is awkward - set up a queue to ensure we can log 131 | logging_queue = Queue() 132 | queue_handler = logging.handlers.QueueListener(logging_queue, *logging.getLogger().handlers) 133 | queue_handler.start() 134 | 135 | # Find the extractors within the given directory 136 | # Execute within a child process to ensure main process interpreter is kept clean 137 | p = Process( 138 | target=utils.proxy_logging, 139 | args=( 140 | logging_queue, 141 | utils.import_extractors, 142 | extractor_module_callback, 143 | ), 144 | kwargs=dict( 145 | root_directory=path_extractors, 146 | scanner=yara.compile(source=utils.MACO_YARA_RULE), 147 | create_venv=create_venv and os.path.isdir(path_extractors), 148 | skip_install=skip_install, 149 | ), 150 | ) 151 | p.start() 152 | p.join() 153 | 154 | # stop multiprocess logging 155 | queue_handler.stop() 156 | logging_queue.close() 157 | 158 | self.extractors = dict(extractors) 159 | if not self.extractors: 160 | raise ExtractorLoadError("no extractors were loaded") 161 | logger.debug(f"found extractors {list(self.extractors.keys())}\n") 162 | 163 | # compile yara rules gathered from extractors 164 | self.rules = yara.compile(sources=dict(namespaced_rules)) 165 | 166 | def match(self, stream: BinaryIO) -> Dict[str, List[yara.Match]]: 167 | """Return extractors that should run based on yara rules.""" 168 | # execute yara rules on file to find extractors we should run 169 | # yara can't run on a stream so we give it a bytestring 170 | matches = self.rules.match(data=stream.read()) 171 | stream.seek(0) 172 | if not matches: 173 | return 174 | # get all rules that hit for each extractor 175 | runs = {} 176 | for match in matches: 177 | runs.setdefault(match.namespace, []).append(match) 178 | 179 | return runs 180 | 181 | def extract( 182 | self, 183 | stream: BinaryIO, 184 | extractor_name: str, 185 | ) -> Dict[str, Any]: 186 | """Run extractor with stream and verify output matches the model. 
187 | 188 | Args: 189 | stream (BinaryIO): Binary stream to analyze 190 | extractor_name (str): Name of extractor to analyze stream 191 | 192 | Returns: 193 | (Dict[str, Any]): Results from extractor 194 | """ 195 | extractor = self.extractors[extractor_name] 196 | try: 197 | # Run extractor on a copy of the sample 198 | with NamedTemporaryFile() as sample_path: 199 | sample_path.write(stream.read()) 200 | sample_path.flush() 201 | # enforce types and verify properties, and remove defaults 202 | return _verify_response( 203 | utils.run_extractor( 204 | sample_path.name, 205 | module_name=extractor["module_name"], 206 | extractor_class=extractor["extractor_class"], 207 | module_path=extractor["module_path"], 208 | venv=extractor["venv"], 209 | ) 210 | ) 211 | except AnalysisAbortedException: 212 | # Extractor voluntarily aborted analysis of sample 213 | return 214 | except Exception: 215 | # caller can deal with the exception 216 | raise 217 | finally: 218 | # make sure to reset where we are in the file 219 | # otherwise follow on extractors are going to read 0 bytes 220 | stream.seek(0) 221 | -------------------------------------------------------------------------------- /maco/cli.py: -------------------------------------------------------------------------------- 1 | """CLI example of how extractors can be executed.""" 2 | 3 | import argparse 4 | import base64 5 | import binascii 6 | import hashlib 7 | import io 8 | import json 9 | import logging 10 | import os 11 | import sys 12 | from importlib.metadata import version 13 | from typing import BinaryIO, List, Tuple 14 | 15 | import cart 16 | 17 | from maco import collector 18 | 19 | logger = logging.getLogger("maco.lib.cli") 20 | 21 | 22 | def process_file( 23 | collected: collector.Collector, 24 | path_file: str, 25 | stream: BinaryIO, 26 | *, 27 | pretty: bool, 28 | force: bool, 29 | include_base64: bool, 30 | ): 31 | """Process a filestream with the extractors and rules. 32 | 33 | Args: 34 | collected (collector.Collector): a Collector instance 35 | path_file (str): path to sample to be analyzed 36 | stream (BinaryIO): binary stream to be analyzed 37 | pretty (bool): Pretty print the JSON output 38 | force (bool): Run all extractors regardless of YARA rule match 39 | include_base64 (bool): include base64'd data in output 40 | 41 | Returns: 42 | (dict): The output from the extractors analyzing the sample 43 | 44 | """ 45 | unneutered = io.BytesIO() 46 | try: 47 | cart.unpack_stream(stream, unneutered) 48 | except Exception: 49 | # use original stream if anything goes wrong here 50 | # i.e. 
invalid/malformed cart 51 | pass 52 | else: 53 | # use unneutered stream 54 | stream = unneutered 55 | # unpack will read some bytes either way so reset position 56 | stream.seek(0) 57 | 58 | # find extractors that should run based on yara rules 59 | if not force: 60 | runs = collected.match(stream) 61 | else: 62 | # execute all extractors with no yara information 63 | # note - extractors may rely on a yara hit so this may cause errors 64 | runs = {x: [] for x in collected.extractors.keys()} 65 | if not runs: 66 | return 67 | 68 | # run extractor for the set of hits 69 | logger.info(f"path: {path_file}") 70 | ret = {} 71 | for extractor_name, hits in runs.items(): 72 | # run and store results for extractor 73 | logger.info(f"run {extractor_name} extractor from rules {[x.rule for x in hits]}") 74 | try: 75 | resp = collected.extract(stream, extractor_name) 76 | except Exception as e: 77 | logger.exception(f"extractor error with {path_file} ({e})") 78 | resp = None 79 | # encode binary data so we can print as json 80 | if resp: 81 | for row in resp.get("binaries", []): 82 | row["sha256"] = hashlib.sha256(row["data"]).hexdigest() 83 | # number of bytes in the binary 84 | row["size"] = len(row["data"]) 85 | # small sample of first part of binary 86 | row["hex_sample"] = binascii.hexlify(row["data"][:32]).decode("utf8").upper() 87 | if include_base64: 88 | # this can be large 89 | row["base64"] = base64.b64encode(row["data"]).decode("utf8") 90 | # do not print raw bytes to console 91 | row.pop("data") 92 | ret[extractor_name] = resp 93 | logger.info(json.dumps(resp, indent=2 if pretty else None)) 94 | logger.info("") 95 | 96 | return ret 97 | 98 | 99 | def process_filesystem( 100 | path_extractors: str, 101 | path_samples: str, 102 | include: List[str], 103 | exclude: List[str], 104 | *, 105 | pretty: bool, 106 | force: bool, 107 | include_base64: bool, 108 | create_venv: bool = False, 109 | skip_install: bool = False, 110 | ) -> Tuple[int, int, int]: 111 | """Process filesystem with extractors and print results of extraction. 112 | 113 | Returns: 114 | (Tuple[int, int, int]): Total number of analysed files, yara hits and successful maco extractions. 
115 | """ 116 | if force: 117 | logger.warning("force execute will cause errors if an extractor requires a yara rule hit during execution") 118 | collected = collector.Collector( 119 | path_extractors, include=include, exclude=exclude, create_venv=create_venv, skip_install=skip_install 120 | ) 121 | 122 | logger.info(f"extractors loaded: {[x for x in collected.extractors.keys()]}\n") 123 | for _, extractor in collected.extractors.items(): 124 | extractor_meta = extractor["metadata"] 125 | logger.info( 126 | f"{extractor_meta['family']} by {extractor_meta['author']}" 127 | f" {extractor_meta['last_modified']} {extractor_meta['sharing']}" 128 | f"\n{extractor_meta['description']}\n" 129 | ) 130 | 131 | num_analysed = 0 132 | num_hits = 0 133 | num_extracted = 0 134 | if os.path.isfile(path_samples): 135 | # analyse a single file 136 | walker = [("", None, [path_samples])] 137 | elif os.path.isdir(path_samples): 138 | # load files from directory tree 139 | walker = os.walk(path_samples) 140 | else: 141 | logger.error(f"not file or folder: {path_samples}") 142 | exit(2) 143 | try: 144 | base_directory = os.path.abspath(path_samples) 145 | for path, _, files in walker: 146 | for file in files: 147 | num_analysed += 1 148 | path_file = os.path.abspath(os.path.join(path, file)) 149 | if not path_file.startswith(base_directory): 150 | logger.error(f"Attempted path traversal detected: {path_file}") 151 | continue 152 | 153 | try: 154 | with open(path_file, "rb") as stream: 155 | resp = process_file( 156 | collected, 157 | path_file, 158 | stream, 159 | pretty=pretty, 160 | force=force, 161 | include_base64=include_base64, 162 | ) 163 | if resp: 164 | num_hits += 1 165 | if any(x for x in resp.values()): 166 | num_extracted += 1 167 | except Exception as e: 168 | logger.exception(f"file error with {path_file} ({e})") 169 | continue 170 | except: 171 | raise 172 | finally: 173 | logger.info("") 174 | logger.info(f"{num_analysed} analysed, {num_hits} hits, {num_extracted} extracted") 175 | return num_analysed, num_hits, num_extracted 176 | 177 | 178 | def main(): 179 | """Main block for CLI.""" 180 | parser = argparse.ArgumentParser(description="Run extractors over samples.") 181 | parser.add_argument("extractors", type=str, help="path to extractors") 182 | parser.add_argument("samples", type=str, help="path to samples") 183 | parser.add_argument( 184 | "-v", 185 | "--verbose", 186 | action="count", 187 | default=0, 188 | help="print debug logging. -v extractor info, -vv extractor debug, -vvv cli debug", 189 | ) 190 | parser.add_argument("--pretty", action="store_true", help="pretty print json output") 191 | parser.add_argument( 192 | "--base64", 193 | action="store_true", 194 | help="Include base64 encoded binary data in output " 195 | "(can be large, consider printing to file rather than console)", 196 | ) 197 | parser.add_argument("--logfile", type=str, help="file to log output") 198 | parser.add_argument("--include", type=str, help="comma separated extractors to run") 199 | parser.add_argument("--exclude", type=str, help="comma separated extractors to not run") 200 | parser.add_argument( 201 | "-f", 202 | "--force", 203 | action="store_true", 204 | help="ignore yara rules and execute all extractors", 205 | ) 206 | parser.add_argument( 207 | "--create_venv", 208 | action="store_true", 209 | help="Creates venvs for every requirements.txt found (only applies when extractor path is a directory). 
" 210 | "This runs much slower than the alternative but may be necessary " 211 | "when there are many extractors with conflicting dependencies.", 212 | ) 213 | parser.add_argument( 214 | "--force_install", 215 | action="store_true", 216 | help="Force installation of Python dependencies for extractors (in both host and virtual environments).", 217 | ) 218 | parser.add_argument( 219 | "--version", 220 | action="version", 221 | version=f"version: {version('maco')}", 222 | help="Show version of MACO", 223 | ) 224 | 225 | args = parser.parse_args() 226 | inc = args.include.split(",") if args.include else [] 227 | exc = args.exclude.split(",") if args.exclude else [] 228 | 229 | # set up logging for lib, only show debug with 3+ verbose 230 | logger_lib = logging.getLogger("maco.lib") 231 | logger_lib.setLevel(logging.DEBUG if args.verbose > 2 else logging.INFO) 232 | ch = logging.StreamHandler(sys.stdout) 233 | ch.setLevel(logging.DEBUG) 234 | logger_lib.addHandler(ch) 235 | 236 | # set up logging for extractor 237 | logger_ex = logging.getLogger("maco.extractor") 238 | if args.verbose == 0: 239 | logger_ex.setLevel(logging.WARNING) 240 | elif args.verbose == 1: 241 | logger_ex.setLevel(logging.INFO) 242 | else: 243 | logger_ex.setLevel(logging.DEBUG) 244 | ch = logging.StreamHandler(sys.stdout) 245 | ch.setLevel(logging.DEBUG) 246 | formatter = logging.Formatter( 247 | fmt="%(asctime)s, [%(levelname)s] %(module)s.%(funcName)s: %(message)s", datefmt="%Y-%m-%d (%H:%M:%S)" 248 | ) 249 | ch.setFormatter(formatter) 250 | logger_ex.addHandler(ch) 251 | 252 | # log everything to file 253 | if args.logfile: 254 | logger = logging.getLogger("maco") 255 | logger_lib.setLevel(logging.DEBUG) 256 | fh = logging.FileHandler(args.logfile) 257 | fh.setLevel(logging.DEBUG) 258 | fh.setFormatter(formatter) 259 | logger.addHandler(fh) 260 | 261 | process_filesystem( 262 | args.extractors, 263 | args.samples, 264 | inc, 265 | exc, 266 | pretty=args.pretty, 267 | force=args.force, 268 | include_base64=args.base64, 269 | create_venv=args.create_venv, 270 | skip_install=not args.force_install, 271 | ) 272 | 273 | 274 | if __name__ == "__main__": 275 | main() 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maco - Malware config extractor framework 2 | 3 | ## Maco is a framework for malware config extractors. 4 | 5 | It aims to solve two problems: 6 | 7 | - Define a standardize ontology (or model) for extractor output. This greatly helps for databasing extracted values. 8 | - Provide a standard way of identifying which parsers to run and how to execute them. 
9 | 10 | ## Maco components 11 | 12 | - `model.py` 13 | - A data model for the common output of an extractor 14 | - `extractor.py` 15 | - Base class for extractors to implement 16 | - `collector.py` 17 | - Utilities for loading and running extractors 18 | - `cli.py` 19 | - A CLI tool `maco` to assist with running your extractors locally 20 | - `base_test.py` 21 | - Assist with writing unit tests for your extractors 22 | 23 | **Note: If you're interested in using only the model in your project, you can `pip install maco-model` which is a smaller package containing only the model definition** 24 | 25 | ## Project Integrations 🛠️ 26 | 27 | This framework is actively being used by: 28 | 29 | | Project | Description | License | 30 | | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 31 | | | A malware analysis platform that uses the MACO model to export malware configuration extractions into a parseable, machine-friendly format | [![License](https://img.shields.io/github/license/CybercentreCanada/assemblyline)](https://github.com/CybercentreCanada/assemblyline/blob/main/LICENSE.md) | 32 | | [configextractor-py](https://github.com/CybercentreCanada/configextractor-py) | A tool designed to run extractors from multiple frameworks and uses the MACO model for output harmonization | [![License](https://img.shields.io/github/license/CybercentreCanada/configextractor-py)](https://github.com/CybercentreCanada/configextractor-py/blob/main/LICENSE.md) | 33 | | | A robust, multiprocessing-capable, multi-family RAT config parser/extractor that is compatible with MACO | [![License](https://img.shields.io/github/license/jeFF0Falltrades/rat_king_parser)](https://github.com/jeFF0Falltrades/rat_king_parser/blob/master/LICENSE) | 34 | | | A parser/extractor repository containing MACO extractors that's authored by the CAPE community but is integrated in [CAPE](https://github.com/kevoreilly/CAPEv2) deployments.
**Note: These MACO extractors wrap and parse the original CAPE extractors.** | [![License](https://img.shields.io/badge/license-GPL--3.0-informational)](https://github.com/kevoreilly/CAPEv2/blob/master/LICENSE) | 35 | 36 | ## Model Example 37 | 38 | See [the model definition](https://github.com/CybercentreCanada/Maco/blob/0f447a66de5e5ce8770ef3fe2325aec002842e63/maco/model.py#L127) for all the supported fields. 39 | You can use the model independently of the rest of the framework. 40 | This is still useful for compatibility between systems! 41 | 42 | ```python 43 | from maco import model 44 | # 'family' is the only required property on the model 45 | output = model.ExtractorModel(family="wanabee") 46 | output.version = "2019" # variant first found in 2019 47 | output.category.extend([model.CategoryEnum.cryptominer, model.CategoryEnum.clickfraud]) 48 | output.http.append(model.ExtractorModel.Http(protocol="https", 49 | uri="https://bad-domain.com/c2_payload", 50 | usage="c2")) 51 | output.tcp.append(model.ExtractorModel.Connection(server_ip="127.0.0.1", 52 | usage="ransom")) 53 | output.campaign_id.append("859186-3224-9284") 54 | output.inject_exe.append("explorer.exe") 55 | output.binaries.append( 56 | output.Binary( 57 | data=b"sam I am", 58 | datatype=output.Binary.TypeEnum.config, 59 | encryption=output.Binary.Encryption( 60 | algorithm="rot26", 61 | mode="block", 62 | ), 63 | ) 64 | ) 65 | # data about the malware that doesn't fit the model 66 | output.other["author_lunch"] = "green eggs and ham" 67 | output.other["author_lunch_time"] = "3pm" 68 | print(output.model_dump(exclude_defaults=True)) 69 | 70 | # Generated model 71 | { 72 | 'family': 'wanabee', 73 | 'version': '2019', 74 | 'category': ['cryptominer', 'clickfraud'], 75 | 'campaign_id': ['859186-3224-9284'], 76 | 'inject_exe': ['explorer.exe'], 77 | 'other': {'author_lunch': 'green eggs and ham', 'author_lunch_time': '3pm'}, 78 | 'http': [{'uri': 'https://bad-domain.com/c2_payload', 'usage': 'c2', 'protocol': 'https'}], 79 | 'tcp': [{'server_ip': '127.0.0.1', 'usage': 'ransom'}], 80 | 'binaries': [{ 81 | 'datatype': 'config', 'data': b'sam I am', 82 | 'encryption': {'algorithm': 'rot26', 'mode': 'block'} 83 | }] 84 | } 85 | ``` 86 | 87 | And you can create model instances from dictionaries: 88 | 89 | ```python 90 | from maco import model 91 | output = { 92 | "family": "wanabee2", 93 | "version": "2022", 94 | "ssh": [ 95 | { 96 | "username": "wanna", 97 | "password": "bee2", 98 | "hostname": "10.1.10.100", 99 | } 100 | ], 101 | } 102 | print(model.ExtractorModel(**output)) 103 | 104 | # Generated model 105 | family='wanabee2' version='2022' category=[] attack=[] capability_enabled=[] 106 | capability_disabled=[] campaign_id=[] identifier=[] decoded_strings=[] 107 | password=[] mutex=[] pipe=[] sleep_delay=None inject_exe=[] other={} 108 | binaries=[] ftp=[] smtp=[] http=[] 109 | ssh=[SSH(username='wanna', password='bee2', hostname='10.1.10.100', port=None, usage=None)] 110 | proxy=[] dns=[] tcp=[] udp=[] encryption=[] service=[] cryptocurrency=[] 111 | paths=[] registry=[] 112 | ``` 113 | 114 | ## Extractor Example 115 | 116 | The following extractor will trigger on any file with more than 50 ELF sections, 117 | and set some properties in the model. 118 | 119 | Your extractors will do a better job of finding useful information than this one! 
120 | 121 | ```python 122 | class Elfy(extractor.Extractor): 123 | """Check basic elf property.""" 124 | 125 | family = "elfy" 126 | author = "blue" 127 | last_modified = "2022-06-14" 128 | yara_rule = """ 129 | import "elf" 130 | 131 | rule Elfy 132 | { 133 | condition: 134 | elf.number_of_sections > 50 135 | } 136 | """ 137 | 138 | def run( 139 | self, stream: BytesIO, matches: List[yara.Match] 140 | ) -> Optional[model.ExtractorModel]: 141 | # return config model formatted results 142 | ret = model.ExtractorModel(family=self.family) 143 | # the list for campaign_id already exists and is empty, so we just add an item 144 | ret.campaign_id.append(str(len(stream.read()))) 145 | return ret 146 | ``` 147 | 148 | ## Writing Extractors 149 | 150 | There are several examples that use Maco in the '`demo_extractors`' folder. 151 | 152 | Some things to keep in mind: 153 | 154 | - The Yara rule names must be prefixed with the extractor class name. 155 | - e.g. Class 'MyScript' has Yara rules named 'MyScriptDetect1' and 'MyScriptDetect2', not 'Detect1' 156 | - You can load other scripts contained within the same folder via a Python relative import 157 | - See `complex.py` for details 158 | - You can standardise your usage of the '`other`' dict 159 | - This is optional, see `limit_other.py` for details 160 | - Consider instead making a PR with the properties you are frequently using 161 | 162 | # Requirements 163 | 164 | Python 3.8+. 165 | 166 | Install this package with `pip install maco`. 167 | 168 | All required Python packages are in the `requirements.txt`. 169 | 170 | # CLI Usage 171 | 172 | ```bash 173 | > maco --help 174 | usage: maco [-h] [-v] [--pretty] [--base64] [--logfile LOGFILE] [--include INCLUDE] [--exclude EXCLUDE] [-f] [--create_venv] extractors samples 175 | 176 | Run extractors over samples. 177 | 178 | positional arguments: 179 | extractors path to extractors 180 | samples path to samples 181 | 182 | optional arguments: 183 | -h, --help show this help message and exit 184 | -v, --verbose print debug logging. -v extractor info, -vv extractor debug, -vvv cli debug 185 | --pretty pretty print json output 186 | --base64 Include base64 encoded binary data in output (can be large, consider printing to file rather than console) 187 | --logfile LOGFILE file to log output 188 | --include INCLUDE comma separated extractors to run 189 | --exclude EXCLUDE comma separated extractors to not run 190 | -f, --force ignore yara rules and execute all extractors 191 | --create_venv Creates venvs for every requirements.txt found (only applies when extractor path is a directory) 192 | ``` 193 | 194 | ## CLI output example 195 | 196 | The CLI is helpful for using your extractors in a standalone system, such as in a reverse engineering environment. 197 | 198 | ```bash 199 | > maco demo_extractors/ /usr/lib --include Complex 200 | extractors loaded: ['Complex'] 201 | 202 | complex by blue 2022-06-14 TLP:WHITE 203 | This script has multiple yara rules and coverage of the data model. 
204 | 205 | path: /usr/lib/udev/hwdb.bin 206 | run Complex extractor from rules ['ComplexAlt'] 207 | {"family": "complex", "version": "5", "decoded_strings": ["Paradise"], 208 | "binaries": [{"datatype": "payload", "size": 9, "hex_sample": "736F6D652064617461", "sha256": "1307990e6ba5ca145eb35e99182a9bec46531bc54ddf656a602c780fa0240dee", 209 | "encryption": {"algorithm": "something"}}], 210 | "http": [{"protocol": "https", "hostname": "blarg5.com", "path": "/malz/9956330", "usage": "c2"}], 211 | "encryption": [{"algorithm": "sha256"}]} 212 | 213 | path: /usr/lib/udev/hwdb.d/20-OUI.hwdb 214 | run Complex extractor from rules ['ComplexAlt'] 215 | {"family": "complex", "version": "5", "decoded_strings": ["Paradise"], 216 | "binaries": [{"datatype": "payload", "size": 9, "hex_sample": "736F6D652064617461", "sha256": "1307990e6ba5ca145eb35e99182a9bec46531bc54ddf656a602c780fa0240dee", 217 | "encryption": {"algorithm": "something"}}], 218 | "http": [{"protocol": "https", "hostname": "blarg5.com", "path": "/malz/1986908", "usage": "c2"}], 219 | "encryption": [{"algorithm": "sha256"}]} 220 | 221 | path: /usr/lib/udev/hwdb.d/20-usb-vendor-model.hwdb 222 | run Complex extractor from rules ['ComplexAlt'] 223 | {"family": "complex", "version": "5", "decoded_strings": ["Paradise"], 224 | "binaries": [{"datatype": "payload", "size": 9, "hex_sample": "736F6D652064617461", "sha256": "1307990e6ba5ca145eb35e99182a9bec46531bc54ddf656a602c780fa0240dee", 225 | "encryption": {"algorithm": "something"}}], 226 | "http": [{"protocol": "https", "hostname": "blarg5.com", "path": "/malz/1257481", "usage": "c2"}], 227 | "encryption": [{"algorithm": "sha256"}]} 228 | 229 | 230 | 15884 analysed, 3 hits, 3 extracted 231 | ``` 232 | 233 | The demo extractors are designed to trigger when run over the '`demo_extractors`' folder. 234 | 235 | e.g. `maco demo_extractors demo_extractors` 236 | 237 | # Contributions 238 | 239 | Please use ruff to format and lint PRs. This may be the cause of PR test failures. 240 | 241 | Ruff will attempt to fix most issues, but some may require manual resolution. 
242 | 243 | ``` 244 | pip install ruff 245 | ruff format 246 | ruff check --fix 247 | ``` 248 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | """Model validation testing.""" 2 | 3 | import unittest 4 | from typing import Dict 5 | 6 | from pydantic import ValidationError 7 | 8 | from maco import collector, model 9 | 10 | 11 | class TestModelObject(unittest.TestCase): 12 | """Test the model object.""" 13 | 14 | maxDiff = None 15 | 16 | def test_model_invalid(self): 17 | """Test invalid model.""" 18 | # family not supplied 19 | self.assertRaises(ValidationError, model.ExtractorModel) 20 | 21 | ret = model.ExtractorModel(family="octopus") 22 | # invalid property 23 | self.assertRaises(ValueError, setattr, *(ret, "invalid", 12345)) 24 | # invalid type 25 | ret.sleep_delay = "test" 26 | self.assertRaises(ValidationError, collector._verify_response, ret) 27 | 28 | def test_model_object_1(self): 29 | """Test the model object with basic requirements.""" 30 | # object example 31 | tmp = model.ExtractorModel(family="scuba") 32 | tmp.campaign_id.append("5467") 33 | self.verify(tmp, {"family": "scuba", "campaign_id": ["5467"]}) 34 | 35 | def test_model_object_2(self): 36 | """Test the model object with more data.""" 37 | em = model.ExtractorModel 38 | tmp = model.ExtractorModel( 39 | family="scuba", 40 | version="lotso_stuff", 41 | category=[], 42 | attack=[], 43 | capability_enabled=[], 44 | capability_disabled=[], 45 | campaign_id=["32"], 46 | identifier=["uxuduxuduxuudux"], 47 | decoded_strings=["there", "are", "some", "strings"], 48 | password=["hunter2"], 49 | mutex=["YEAH"], 50 | pipe=["xiod"], 51 | sleep_delay=45000, 52 | sleep_delay_jitter=2500, 53 | inject_exe=["Teams.exe"], 54 | other={"misc_data": {"nested": 5}}, 55 | binaries=[ 56 | em.Binary( 57 | datatype=None, 58 | data=b"\x10\x20\x30\x40", 59 | other={ 60 | "datatype": ["payload"], 61 | "extension": [".invalid"], 62 | "label": ["xor 0x04 at 0x2130-0x2134"], 63 | "some_junk": [1, 2, 3, 4, 5, 6], 64 | }, 65 | encryption=em.Binary.Encryption( 66 | algorithm="alxor", 67 | public_key=None, 68 | key=None, 69 | provider=None, 70 | mode=None, 71 | iv=None, 72 | seed=None, 73 | nonce=None, 74 | constants=[], 75 | usage="binary", 76 | ), 77 | ), 78 | em.Binary( 79 | datatype=None, 80 | data=b"\x50\x60\x70\x80", 81 | other={"datatype": ["payload"]}, 82 | encryption=[ 83 | em.Binary.Encryption( 84 | algorithm="alxor", 85 | public_key=None, 86 | key=None, 87 | provider=None, 88 | mode=None, 89 | iv=None, 90 | seed=None, 91 | nonce=None, 92 | constants=[], 93 | usage="binary", 94 | ), 95 | em.Binary.Encryption( 96 | algorithm="RC4", 97 | public_key=None, 98 | key=None, 99 | provider=None, 100 | mode=None, 101 | iv=None, 102 | seed=None, 103 | nonce=None, 104 | constants=[], 105 | usage="binary", 106 | ), 107 | ], 108 | ), 109 | ], 110 | ftp=[ 111 | em.FTP( 112 | username=None, 113 | password=None, 114 | hostname="somewhere", 115 | port=None, 116 | path=None, 117 | usage="c2", 118 | ) 119 | ], 120 | smtp=[ 121 | em.SMTP( 122 | username=None, 123 | password=None, 124 | hostname="here.com", 125 | port=None, 126 | mail_to=[], 127 | mail_from=None, 128 | subject=None, 129 | usage="upload", 130 | ) 131 | ], 132 | http=[ 133 | em.Http( 134 | uri=None, 135 | protocol="https", 136 | username=None, 137 | password=None, 138 | hostname="blarg.com", 139 | port=None, 140 | path="/malz", 141 | query=None, 142 | fragment=None, 
143 | user_agent=None, 144 | method=None, 145 | headers=None, 146 | max_size=None, 147 | usage="c2", 148 | ) 149 | ], 150 | ssh=[ 151 | em.SSH( 152 | username=None, 153 | password=None, 154 | hostname="bad.malware", 155 | port=None, 156 | usage="download", 157 | ) 158 | ], 159 | proxy=[ 160 | em.Proxy( 161 | protocol=None, 162 | username=None, 163 | password=None, 164 | hostname="192.168.0.80", 165 | port=None, 166 | usage="tunnel", 167 | ) 168 | ], 169 | icmp=[ 170 | em.ICMP( 171 | type=None, 172 | code=None, 173 | header="DEADBEEF", 174 | hostname="192.168.0.80", 175 | usage="c2", 176 | ) 177 | ], 178 | dns=[em.DNS(ip="123.21.21.21", port=None, usage="other")], 179 | tcp=[ 180 | em.Connection( 181 | client_ip=None, 182 | client_port=None, 183 | server_ip="73.21.32.43", 184 | server_domain=None, 185 | server_port=None, 186 | usage="c2", 187 | ) 188 | ], 189 | udp=[ 190 | em.Connection( 191 | client_ip=None, 192 | client_port=None, 193 | server_ip="73.21.32.43", 194 | server_domain=None, 195 | server_port=None, 196 | usage="c2", 197 | ) 198 | ], 199 | encryption=[ 200 | em.Encryption( 201 | algorithm="alxor", 202 | public_key=None, 203 | key=None, 204 | provider=None, 205 | mode=None, 206 | iv=None, 207 | seed=None, 208 | nonce=None, 209 | constants=[], 210 | usage="binary", 211 | ) 212 | ], 213 | service=[ 214 | em.Service( 215 | dll=None, 216 | name="DeviceMonitorSvc", 217 | display_name="DeviceMonitorSvc", 218 | description="Device Monitor Service", 219 | ) 220 | ], 221 | cryptocurrency=[ 222 | em.Cryptocurrency( 223 | coin="APE", 224 | address="689fdh658790d6dr987yth84iyth7er8gtrfohyt9", 225 | ransom_amount=None, 226 | usage="miner", 227 | ) 228 | ], 229 | paths=[ 230 | em.Path(path="C:/Windows/system32", usage="install"), 231 | em.Path(path="C:/user/USERNAME/xxxxx/xxxxx/", usage="logs"), 232 | em.Path(path="\\here\\is\\some\\place", usage="install"), 233 | ], 234 | registry=[ 235 | em.Registry(key="HKLM_LOCAL_USER/some/location/to/key", usage="store_data"), 236 | em.Registry(key="HKLM_LOCAL_USER/system/location", usage="read"), 237 | ], 238 | ) 239 | self.verify( 240 | tmp, 241 | { 242 | "family": "scuba", 243 | "version": "lotso_stuff", 244 | "campaign_id": ["32"], 245 | "identifier": ["uxuduxuduxuudux"], 246 | "decoded_strings": ["there", "are", "some", "strings"], 247 | "password": ["hunter2"], 248 | "mutex": ["YEAH"], 249 | "pipe": ["xiod"], 250 | "sleep_delay": 45000, 251 | "sleep_delay_jitter": 2500, 252 | "icmp": [{"header": "DEADBEEF", "hostname": "192.168.0.80", "usage": "c2"}], 253 | "inject_exe": ["Teams.exe"], 254 | "other": {"misc_data": {"nested": 5}}, 255 | "binaries": [ 256 | { 257 | "data": b"\x10 0@", 258 | "other": { 259 | "datatype": ["payload"], 260 | "extension": [".invalid"], 261 | "label": ["xor 0x04 at 0x2130-0x2134"], 262 | "some_junk": [1, 2, 3, 4, 5, 6], 263 | }, 264 | "encryption": {"algorithm": "alxor", "usage": "binary"}, 265 | }, 266 | { 267 | "data": b"P`p\x80", 268 | "other": {"datatype": ["payload"]}, 269 | "encryption": [ 270 | {"algorithm": "alxor", "usage": "binary"}, 271 | {"algorithm": "RC4", "usage": "binary"}, 272 | ], 273 | }, 274 | ], 275 | "ftp": [{"hostname": "somewhere", "usage": "c2"}], 276 | "smtp": [{"hostname": "here.com", "usage": "upload"}], 277 | "http": [ 278 | { 279 | "protocol": "https", 280 | "hostname": "blarg.com", 281 | "path": "/malz", 282 | "usage": "c2", 283 | } 284 | ], 285 | "ssh": [{"hostname": "bad.malware", "usage": "download"}], 286 | "proxy": [{"hostname": "192.168.0.80", "usage": "tunnel"}], 287 | "dns": [{"ip": 
"123.21.21.21", "usage": "other"}], 288 | "tcp": [{"server_ip": "73.21.32.43", "usage": "c2"}], 289 | "udp": [{"server_ip": "73.21.32.43", "usage": "c2"}], 290 | "encryption": [{"algorithm": "alxor", "usage": "binary"}], 291 | "service": [ 292 | { 293 | "name": "DeviceMonitorSvc", 294 | "display_name": "DeviceMonitorSvc", 295 | "description": "Device Monitor Service", 296 | } 297 | ], 298 | "cryptocurrency": [ 299 | { 300 | "coin": "APE", 301 | "address": "689fdh658790d6dr987yth84iyth7er8gtrfohyt9", 302 | "usage": "miner", 303 | } 304 | ], 305 | "paths": [ 306 | {"path": "C:/Windows/system32", "usage": "install"}, 307 | {"path": "C:/user/USERNAME/xxxxx/xxxxx/", "usage": "logs"}, 308 | {"path": "\\here\\is\\some\\place", "usage": "install"}, 309 | ], 310 | "registry": [ 311 | { 312 | "key": "HKLM_LOCAL_USER/some/location/to/key", 313 | "usage": "store_data", 314 | }, 315 | {"key": "HKLM_LOCAL_USER/system/location", "usage": "read"}, 316 | ], 317 | }, 318 | ) 319 | 320 | def verify(self, in1, in2: Dict) -> Dict: 321 | """Verify the returned data matches the schema.""" 322 | resp = collector._verify_response(in1) 323 | self.assertEqual(resp, in2) 324 | 325 | 326 | class TestModelDict(unittest.TestCase): 327 | """Test verifying dicts against the schema.""" 328 | 329 | def test_model_1(self): 330 | """Test the model object with basic requirements.""" 331 | # dict example 332 | self.verify( 333 | { 334 | "family": "scuba", 335 | "version": "30-01-2023", 336 | "http": [ 337 | { 338 | "protocol": "https", 339 | "hostname": "blarg.com", 340 | "path": "/malz", 341 | "usage": "c2", 342 | } 343 | ], 344 | } 345 | ) 346 | 347 | def test_model_2(self): 348 | """Test the model object with more data.""" 349 | # dict example large 350 | self.maxDiff = None 351 | 352 | self.verify( 353 | { 354 | "family": "scuba", 355 | "version": "lotso_stuff", 356 | "binaries": [ 357 | { 358 | "data": rb"\x10\x20\x30\x40", 359 | "encryption": {"algorithm": "alxor", "usage": "binary"}, 360 | "other": { 361 | "datatype": ["payload"], 362 | "extension": [".invalid"], 363 | "label": ["xor 0x04 at 0x2130-0x2134"], 364 | "some_junk": [1, 2, 3, 4, 5, 6], 365 | }, 366 | }, 367 | { 368 | "data": rb"\x50\x60\x70\x80", 369 | "encryption": [ 370 | {"algorithm": "alxor", "usage": "binary"}, 371 | {"algorithm": "RC4", "usage": "binary"}, 372 | ], 373 | "other": { 374 | "datatype": ["payload"], 375 | }, 376 | }, 377 | ], 378 | "ftp": [{"hostname": "somewhere", "usage": "c2"}], 379 | "smtp": [{"hostname": "here.com", "usage": "upload"}], 380 | "http": [ 381 | { 382 | "protocol": "https", 383 | "hostname": "blarg.com", 384 | "path": "/malz", 385 | "usage": "c2", 386 | } 387 | ], 388 | "ssh": [{"hostname": "bad.malware", "usage": "download"}], 389 | "proxy": [{"hostname": "192.168.0.80", "usage": "tunnel"}], 390 | "dns": [{"ip": "123.21.21.21", "usage": "other"}], 391 | "tcp": [{"server_ip": "73.21.32.43", "usage": "c2"}], 392 | "udp": [{"server_ip": "73.21.32.43", "usage": "c2"}], 393 | "encryption": [{"algorithm": "alxor", "usage": "binary"}], 394 | "service": [ 395 | { 396 | "name": "DeviceMonitorSvc", 397 | "display_name": "DeviceMonitorSvc", 398 | "description": "Device Monitor Service", 399 | } 400 | ], 401 | "cryptocurrency": [ 402 | { 403 | "coin": "APE", 404 | "address": "689fdh658790d6dr987yth84iyth7er8gtrfohyt9", 405 | "usage": "miner", 406 | } 407 | ], 408 | "paths": [ 409 | {"path": "C:/Windows/system32", "usage": "install"}, 410 | {"path": "C:/user/USERNAME/xxxxx/xxxxx/", "usage": "logs"}, 411 | {"path": 
"\\here\\is\\some\\place", "usage": "install"}, 412 | ], 413 | "registry": [ 414 | { 415 | "key": "HKLM_LOCAL_USER/some/location/to/key", 416 | "usage": "store_data", 417 | }, 418 | {"key": "HKLM_LOCAL_USER/system/location", "usage": "read"}, 419 | ], 420 | "campaign_id": ["32"], 421 | "identifier": ["uxuduxuduxuudux"], 422 | "decoded_strings": ["there", "are", "some", "strings"], 423 | "password": ["hunter2"], 424 | "mutex": ["YEAH"], 425 | "pipe": ["xiod"], 426 | "sleep_delay": 45000, 427 | "inject_exe": ["Teams.exe"], 428 | "other": {"misc_data": {"nested": 5}}, 429 | } 430 | ) 431 | 432 | def verify(self, config: Dict) -> Dict: 433 | """Verify the returned data matches the schema.""" 434 | tmp = model.ExtractorModel.model_validate(config) 435 | resp = collector._verify_response(tmp) 436 | self.assertEqual(resp, config) 437 | -------------------------------------------------------------------------------- /maco/utils.py: -------------------------------------------------------------------------------- 1 | """Common utilities shared between the MACO collector and configextractor-py.""" 2 | 3 | import importlib 4 | import inspect 5 | import json 6 | import logging 7 | import logging.handlers 8 | import os 9 | import re 10 | import shutil 11 | import subprocess 12 | import sys 13 | import tempfile 14 | from importlib.machinery import SourceFileLoader 15 | 16 | from multiprocess import Process, Queue 17 | 18 | from maco import yara 19 | 20 | if sys.version_info >= (3, 11): 21 | import tomllib 22 | else: 23 | import tomli as tomllib 24 | 25 | from base64 import b64decode 26 | from copy import deepcopy 27 | from glob import glob 28 | from logging import Logger 29 | from types import ModuleType 30 | from typing import Callable, Dict, List, Tuple, Union 31 | 32 | from uv import find_uv_bin 33 | 34 | from maco import model 35 | from maco.exceptions import AnalysisAbortedException 36 | from maco.extractor import Extractor 37 | 38 | logger = logging.getLogger("maco.lib.utils") 39 | 40 | VENV_DIRECTORY_NAME = ".venv" 41 | 42 | RELATIVE_FROM_RE = re.compile(rb"from (\.+)") 43 | RELATIVE_FROM_IMPORT_RE = re.compile(rb"from (\.+) import") 44 | 45 | UV_BIN = find_uv_bin() 46 | 47 | PIP_CMD = f"{UV_BIN} pip" 48 | VENV_CREATE_CMD = f"{UV_BIN} venv" 49 | 50 | 51 | class Base64Decoder(json.JSONDecoder): 52 | """JSON decoder that also base64 encodes binary data.""" 53 | 54 | def __init__(self, *args, **kwargs): 55 | """Initialize the decoder.""" 56 | json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) 57 | 58 | def object_hook(self, obj): 59 | """Hook to decode base64 encoded binary data.""" # noqa: DOC201 60 | if "__class__" not in obj: 61 | return obj 62 | type = obj["__class__"] 63 | if type == "bytes": 64 | return b64decode(obj["data"]) 65 | return obj 66 | 67 | 68 | VENV_SCRIPT = """ 69 | import importlib 70 | import json 71 | import os 72 | import sys 73 | import logging 74 | 75 | try: 76 | # Respect cases where the extractor is tied to certain version of yara-python for processing 77 | import yara 78 | except: 79 | # Otherwise fallback to MACO's interface for yara-python==4.5.x 80 | from maco import yara 81 | 82 | from base64 import b64encode 83 | 84 | # ensure we have a logger to stderr 85 | import logging 86 | logger = logging.getLogger() 87 | logger.setLevel(logging.DEBUG) 88 | sh = logging.StreamHandler() 89 | logger.addHandler(sh) 90 | sh.setLevel(logging.DEBUG) 91 | formatter = logging.Formatter( 92 | fmt="%(asctime)s, [%(levelname)s] %(module)s.%(funcName)s: 
%(message)s", datefmt="%Y-%m-%d (%H:%M:%S)" 93 | ) 94 | sh.setFormatter(formatter) 95 | 96 | parent_package_path = "{parent_package_path}" 97 | sys.path.insert(1, parent_package_path) 98 | mod = importlib.import_module("{module_name}") 99 | 100 | class Base64Encoder(json.JSONEncoder): 101 | def default(self, o): 102 | if isinstance(o, bytes): 103 | return dict(__class__="bytes", data=b64encode(o).decode()) 104 | return json.JSONEncoder.default(self, o) 105 | matches = [] 106 | if mod.{module_class}.yara_rule: 107 | matches = yara.compile(source=mod.{module_class}.yara_rule).match("{sample_path}") 108 | result = mod.{module_class}().run(open("{sample_path}", 'rb'), matches=matches) 109 | 110 | with open("{output_path}", 'w') as fp: 111 | if not result: 112 | json.dump(dict(), fp) 113 | else: 114 | try: 115 | json.dump(result.model_dump(exclude_defaults=True, exclude_none=True), fp, cls=Base64Encoder) 116 | except AttributeError: 117 | # venv likely has an older version of Pydantic < 2 installed 118 | json.dump(result.dict(exclude_defaults=True, exclude_none=True), fp, cls=Base64Encoder) 119 | """ 120 | 121 | MACO_YARA_RULE = r""" 122 | rule MACO { 123 | meta: 124 | desc = "Used to match on Python files that contain MACO extractors" 125 | strings: 126 | $from = "from maco" 127 | $import = "import maco" 128 | $extractor = "Extractor" 129 | $class = /class \w+\(([a-zA-Z.]+)?Extractor\)\:/ 130 | condition: 131 | ($from or $import) and $extractor and $class 132 | } 133 | """ 134 | 135 | 136 | def maco_extractor_validation(module: ModuleType) -> bool: 137 | """Validation function for extractors. 138 | 139 | Returns: 140 | (bool): True if extractor belongs to MACO, False otherwise. 141 | """ 142 | if inspect.isclass(module): 143 | # 'author' has to be implemented otherwise will raise an exception according to MACO 144 | return hasattr(module, "author") and module.author 145 | return False 146 | 147 | 148 | def maco_extract_rules(module: Extractor) -> str: 149 | """Extracts YARA rules from extractor. 150 | 151 | Returns: 152 | (str): YARA rules 153 | """ 154 | return module.yara_rule 155 | 156 | 157 | def scan_for_extractors(root_directory: str, scanner: yara.Rules, logger: Logger) -> Tuple[List[str], List[str]]: 158 | """Looks for extractors using YARA rules. 159 | 160 | Args: 161 | root_directory (str): Root directory containing extractors 162 | scanner (yara.Rules): Scanner to look for extractors using YARA rules 163 | logger (Logger): Logger to use 164 | 165 | Returns: 166 | Tuple[List[str], List[str]]: Returns a list of extractor directories and extractor files 167 | 168 | """ 169 | extractor_dirs = set([root_directory]) 170 | extractor_files = [] 171 | 172 | def scan_and_repair(directory, package=None): 173 | nodes = os.listdir(directory) 174 | 175 | if "__init__.py" in nodes and not package and "-" not in os.path.basename(directory): 176 | # Perhaps we've found the outermost package? 
177 | package = os.path.basename(directory) 178 | 179 | for node in nodes: 180 | path = os.path.join(directory, node) 181 | if node == VENV_DIRECTORY_NAME: 182 | # Ignore looking for extractors within packages 183 | continue 184 | elif not node.endswith(".py") and os.path.isfile(path): 185 | # Ignore scanning non-Python files 186 | continue 187 | elif node in ["setup.py"]: 188 | # Ignore setup files and markers for package directories 189 | continue 190 | elif "test" in node: 191 | # Ignore test files 192 | continue 193 | elif "deprecated" in node: 194 | # Ignore deprecated files 195 | continue 196 | 197 | if os.path.isfile(os.path.join(directory, node)): 198 | # Scan Python file for potential extractors 199 | if package: 200 | # Inspect the contents and look for any relative import issues 201 | with open(path, "rb") as f: 202 | data = f.read() 203 | 204 | # Replace any relative importing with absolute 205 | changed_imports = False 206 | curr_dir = os.path.dirname(path) 207 | split = curr_dir.split("/")[::-1] 208 | for pattern in [RELATIVE_FROM_IMPORT_RE, RELATIVE_FROM_RE]: 209 | for match in pattern.findall(data): 210 | depth = match.count(b".") 211 | abspath = ".".join(split[depth - 1 : split.index(package) + 1][::-1]) 212 | abspath += "." if pattern == RELATIVE_FROM_RE else "" 213 | data = data.replace(f"from {match.decode()}".encode(), f"from {abspath}".encode(), 1) 214 | changed_imports = True 215 | 216 | # only write extractor files if imports were changed 217 | if changed_imports: 218 | with open(path, "wb") as f: 219 | f.write(data) 220 | 221 | if scanner.match(path): 222 | # Add directory to list of hits for venv creation 223 | extractor_dirs.add(directory) 224 | extractor_files.append(os.path.realpath(path)) 225 | else: 226 | scan_and_repair(path, package) 227 | 228 | # Search for extractors using YARA rules 229 | logger.info("Searching for prospective extractors based on YARA rules..") 230 | scan_and_repair(root_directory) 231 | 232 | return extractor_dirs, extractor_files 233 | 234 | 235 | def _install_required_packages(create_venv: bool, directories: List[str], python_version: str, logger: Logger): 236 | venvs = [] 237 | env = deepcopy(os.environ) 238 | stop_directory = os.path.dirname(sorted(directories)[0]) 239 | # Track directories that we've already visited 240 | visited_dirs = [] 241 | for dir in directories: 242 | # Recurse backwards through the directory structure to look for package requirements 243 | while dir != stop_directory and dir not in visited_dirs: 244 | req_files = list({"requirements.txt", "pyproject.toml"}.intersection(set(os.listdir(dir)))) 245 | if req_files: 246 | # create a virtual environment, otherwise directly install into current env 247 | if create_venv: 248 | venv_path = os.path.join(dir, VENV_DIRECTORY_NAME) 249 | logger.info(f"Updating virtual environment {venv_path}") 250 | env.update({"VIRTUAL_ENV": venv_path}) 251 | # Create a virtual environment for the directory 252 | if not os.path.exists(venv_path): 253 | cmd = f"{VENV_CREATE_CMD} --python {python_version}" 254 | subprocess.run(cmd.split(" ") + [venv_path], capture_output=True, env=env) 255 | 256 | # Install/Update the packages in the environment 257 | install_command = PIP_CMD.split(" ") + ["install"] 258 | # When running locally, only install packages to required spec. 259 | # This prevents issues during maco development and building extractors against local libraries. 260 | if create_venv: 261 | # when running in custom virtual environment, always upgrade packages. 
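                    # (--upgrade refreshes already-installed packages; --no-cache bypasses uv's
                    # cache so packages are fetched fresh each run, at the cost of slower installs)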
262 | install_command.extend(["--upgrade", "--no-cache"]) 263 | 264 | # Update the pip install command depending on where the dependencies are coming from 265 | if "requirements.txt" in req_files: 266 | # Perform a pip install using the requirements flag 267 | install_command.extend(["--requirements", "requirements.txt"]) 268 | elif "pyproject.toml" in req_files: 269 | # Assume we're dealing with a project directory 270 | pyproject_command = ["--editable", "."] 271 | 272 | # Check to see if there are optional dependencies required 273 | with open(os.path.join(dir, "pyproject.toml"), "rb") as f: 274 | parsed_toml_project = tomllib.load(f).get("project", {}) 275 | for dep_name, dependencies in parsed_toml_project.get("optional-dependencies", {}).items(): 276 | # Look for the dependency that hints at use of MACO for the extractors 277 | if "maco" in " ".join(dependencies): 278 | pyproject_command = [f".[{dep_name}]"] 279 | break 280 | 281 | install_command.extend(pyproject_command) 282 | 283 | # Always require maco-extractor to be installed 284 | install_command.append("maco-extractor") 285 | logger.debug(f"Install command: {' '.join(install_command)} [{dir}]") 286 | # this uses VIRTUAL_ENV to control usage of a virtual environment 287 | p = subprocess.run( 288 | install_command, 289 | cwd=dir, 290 | capture_output=True, 291 | env=env, 292 | ) 293 | if p.returncode != 0: 294 | if b"is being installed using the legacy" in p.stderr: 295 | # Ignore these types of errors 296 | continue 297 | logger.error(f"Error installing into venv:\n{p.stdout.decode()}\n{p.stderr.decode()}") 298 | else: 299 | logger.debug(f"Installed dependencies into venv:\n{p.stdout.decode()}\n{p.stderr.decode()}") 300 | if create_venv: 301 | venvs.append(venv_path) 302 | 303 | # Cleanup any build directories that are the product of package installation 304 | expected_build_path = os.path.join(dir, "build") 305 | if os.path.exists(expected_build_path): 306 | shutil.rmtree(expected_build_path) 307 | 308 | # Add directories to our visited list and check the parent of this directory on the next loop 309 | visited_dirs.append(dir) 310 | dir = os.path.dirname(dir) 311 | return venvs 312 | 313 | 314 | def find_and_insert_venv(path: str, venvs: List[str]) -> Tuple[str, str]: 315 | """Finds the closest virtual environment to the extractor and inserts it into the PATH. 316 | 317 | Args: 318 | path (str): Path of extractor 319 | venvs (List[str]): List of virtual environments 320 | 321 | Returns: 322 | (Tuple[str, str]): Virtual environment and site-packages path that's closest to the extractor 323 | """ 324 | venv = None 325 | for venv in sorted(venvs, reverse=True): 326 | venv_parent = os.path.dirname(venv) 327 | if path.startswith(f"{venv_parent}/"): 328 | # Found the virtual environment that's the closest to extractor 329 | break 330 | 331 | if not venv: 332 | return None, None 333 | 334 | if venv: 335 | # Insert the venv's site-packages into the PATH temporarily to load the module 336 | for site_package in glob(os.path.join(venv, "lib/python*/site-packages")): 337 | if site_package not in sys.path: 338 | sys.path.insert(2, site_package) 339 | break 340 | 341 | return venv, site_package 342 | 343 | 344 | def register_extractor_module( 345 | extractor_source_file: str, 346 | module_name: str, 347 | venvs: List[str], 348 | extractor_module_callback: Callable[[ModuleType, str], None], 349 | logger: Logger, 350 | ): 351 | """Register the extractor module in isolation. 
352 | 
353 |     Args:
354 |         extractor_source_file (str): Path to source file of extractor
355 |         module_name (str): The name of the module relative to the package directory
356 |         venvs (List[str]): List of virtual environments
357 |         extractor_module_callback (Callable[[ModuleType, str], None]): Callback used to register extractors
358 |         logger (Logger): Logger to use
359 | 
360 |     """
361 |     try:
362 |         logger.info(f"Inspecting '{extractor_source_file}' for extractors..")
363 |         venv, site_packages = find_and_insert_venv(extractor_source_file, venvs)
364 |         loader = SourceFileLoader(
365 |             module_name,
366 |             extractor_source_file,
367 |         )
368 |         extractor_module_callback(loader.load_module(), venv)
369 |     finally:
370 |         # Cleanup virtual environment that was loaded into PATH
371 |         if venv and site_packages in sys.path:
372 |             sys.path.remove(site_packages)
373 | 
374 | 
375 | def register_extractors(
376 |     current_directory: str,
377 |     venvs: List[str],
378 |     extractor_files: List[str],
379 |     extractor_module_callback: Callable[[ModuleType, str], None],
380 |     logger: Logger,
381 | ):
382 |     """Register extractors within the current directory.
383 | 
384 |     Args:
385 |         current_directory (str): Directory in which to register the extractors that were found
386 |         venvs (List[str]): List of virtual environments
387 |         extractor_files (List[str]): List of extractor files found
388 |         extractor_module_callback (Callable[[ModuleType, str], None]): Callback used to register extractors
389 |         logger (Logger): Logger to use
390 |     """
391 |     package_name = os.path.basename(current_directory)
392 |     parent_directory = os.path.dirname(current_directory)
393 |     if venvs and package_name in sys.modules:
394 |         # this may happen as part of testing if some part of the extractor code was directly imported
395 |         logger.warning(
396 |             f"Looks like {package_name} is already loaded. "
397 |             "If your maco extractor overlaps an existing package name this could cause problems."
398 | ) 399 | 400 | try: 401 | # Modify the PATH so we can recognize this new package on import 402 | sys.path.insert(1, current_directory) 403 | sys.path.insert(1, parent_directory) 404 | 405 | # Load the potential extractors directly from the source file 406 | registration_processes = [] 407 | for extractor_source_file in extractor_files: 408 | module_name = extractor_source_file.replace(f"{parent_directory}/", "").replace("/", ".")[:-3] 409 | p = Process( 410 | target=register_extractor_module, 411 | args=(extractor_source_file, module_name, venvs, extractor_module_callback, logger), 412 | ) 413 | p.start() 414 | registration_processes.append(p) 415 | 416 | for p in registration_processes: 417 | p.join() 418 | 419 | finally: 420 | # Cleanup changes made to PATH 421 | sys.path.remove(parent_directory) 422 | sys.path.remove(current_directory) 423 | 424 | 425 | def proxy_logging(queue: Queue, callback: Callable[[ModuleType, str], None], *args, **kwargs): 426 | """Ensures logging is set up correctly for a child process and then executes the callback.""" 427 | logger = logging.getLogger() 428 | qh = logging.handlers.QueueHandler(queue) 429 | qh.setLevel(logging.DEBUG) 430 | logger.addHandler(qh) 431 | callback(*args, **kwargs, logger=logger) 432 | 433 | 434 | def import_extractors( 435 | extractor_module_callback: Callable[[ModuleType, str], bool], 436 | *, 437 | root_directory: str, 438 | scanner: yara.Rules, 439 | create_venv: bool, 440 | logger: Logger, 441 | python_version: str = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}", 442 | skip_install: bool = False, 443 | ): 444 | """Import extractors in a given directory. 445 | 446 | Args: 447 | extractor_module_callback (Callable[[ModuleType, str], bool]): Callback used to register extractors 448 | root_directory (str): Root directory to look for extractors 449 | scanner (yara.Rules): Scanner to look for extractors that match YARA rule 450 | create_venv (bool): Create/Use virtual environments 451 | logger (Logger): Logger to use 452 | python_version (str): Version of python to use when creating virtual environments 453 | skip_install (bool): Skip installation of Python dependencies for extractors 454 | """ 455 | extractor_dirs, extractor_files = scan_for_extractors(root_directory, scanner, logger) 456 | 457 | logger.info(f"Extractor files found based on scanner ({len(extractor_files)}).") 458 | logger.debug(extractor_files) 459 | 460 | if not skip_install: 461 | # Install packages into the current environment or dynamically created virtual environments 462 | venvs = _install_required_packages(create_venv, extractor_dirs, python_version, logger) 463 | else: 464 | # Look for pre-existing virtual environments, if any 465 | logger.info("Checking for pre-existing virtual environment(s)..") 466 | venvs = [ 467 | os.path.join(root, VENV_DIRECTORY_NAME) 468 | for root, dirs, _ in os.walk(root_directory) 469 | if VENV_DIRECTORY_NAME in dirs 470 | ] 471 | 472 | # With the environment prepared, we can now hunt for the extractors and register them 473 | logger.info("Registering extractors..") 474 | register_extractors(root_directory, venvs, extractor_files, extractor_module_callback, logger) 475 | 476 | 477 | # holds cached extractors when not running in venv mode 478 | _loaded_extractors: Dict[str, Extractor] = {} 479 | 480 | 481 | def run_extractor( 482 | sample_path, 483 | module_name, 484 | extractor_class, 485 | module_path, 486 | venv, 487 | venv_script=VENV_SCRIPT, 488 | json_decoder=Base64Decoder, 489 | ) -> 
Union[Dict[str, dict], model.ExtractorModel]: 490 | """Runs the maco extractor against sample either in current process or child process. 491 | 492 | Args: 493 | sample_path (str): Path to sample 494 | module_name (str): Name of extractor module 495 | extractor_class (str): Name of extractor class in module 496 | module_path (str): Path to Python module containing extractor 497 | venv (str): Path to virtual environment associated to extractor 498 | venv_script (str): Script to run extractor in a virtual environment 499 | json_decoder (Base64Decoder): Decoder used for JSON 500 | 501 | Raises: 502 | AnalysisAbortedException: Raised when extractor voluntarily terminates execution 503 | Exception: Raised when extractor raises an exception 504 | 505 | Returns: 506 | Union[Dict[str, dict], model.ExtractorModel]: Results from extractor 507 | """ 508 | if not venv: 509 | key = f"{module_name}_{extractor_class}" 510 | if key not in _loaded_extractors: 511 | # dynamic import of extractor 512 | try: 513 | # Add the correct directory to the PATH before attempting to load the extractor 514 | import_path = module_path[: -4 - len(module_name)] 515 | sys.path.insert(1, import_path) 516 | mod = importlib.import_module(module_name) 517 | extractor_cls = mod.__getattribute__(extractor_class) 518 | extractor = extractor_cls() 519 | 520 | # Add to cache 521 | _loaded_extractors[key] = extractor 522 | finally: 523 | sys.path.pop(1) 524 | 525 | else: 526 | # retrieve cached extractor 527 | extractor = _loaded_extractors[key] 528 | if extractor.yara_compiled: 529 | matches = extractor.yara_compiled.match(sample_path) 530 | loaded = extractor.run(open(sample_path, "rb"), matches=matches) 531 | else: 532 | # execute extractor in child process with separate virtual environment 533 | # Write temporary script in the same directory as extractor to resolve relative imports 534 | python_exe = os.path.join(venv, "bin", "python") 535 | dirname = os.path.dirname(module_path) 536 | with tempfile.NamedTemporaryFile("w", dir=dirname, suffix=".py") as script: 537 | with tempfile.NamedTemporaryFile() as output: 538 | parent_package_path = dirname.rsplit(module_name.split(".", 1)[0], 1)[0] 539 | root_directory = module_path[:-3].rsplit(module_name.split(".", 1)[1].replace(".", "/"))[0] 540 | 541 | script.write( 542 | venv_script.format( 543 | parent_package_path=parent_package_path, 544 | module_name=module_name, 545 | module_class=extractor_class, 546 | sample_path=sample_path, 547 | output_path=output.name, 548 | ) 549 | ) 550 | script.flush() 551 | cwd = root_directory 552 | custom_module = script.name[:-3].replace(root_directory, "").replace("/", ".") 553 | 554 | if custom_module.startswith("src."): 555 | # src layout found, which means the actual module content is within 'src' directory 556 | custom_module = custom_module[4:] 557 | cwd = os.path.join(cwd, "src") 558 | 559 | # run the maco extractor in full venv process isolation (slow) 560 | proc = subprocess.run( 561 | [python_exe, "-m", custom_module], 562 | cwd=cwd, 563 | capture_output=True, 564 | ) 565 | stderr = proc.stderr.decode() 566 | try: 567 | # Load results and return them 568 | output.seek(0) 569 | loaded = json.load(output, cls=json_decoder) 570 | except Exception as e: 571 | # If there was an error raised during runtime, then propagate 572 | delim = f'File "{module_path}"' 573 | exception = stderr 574 | if delim in exception: 575 | exception = f"{delim}{exception.split(delim, 1)[1]}" 576 | if "maco.exceptions.AnalysisAbortedException" in exception: 577 | # 
Extractor voluntarily terminated, re-raise exception to be handled by collector 578 | raise AnalysisAbortedException( 579 | exception.split("maco.exceptions.AnalysisAbortedException: ")[-1] 580 | ) 581 | else: 582 | # print extractor logging at error level 583 | logger.error(f"maco extractor raised exception, stderr:\n{stderr}") 584 | raise Exception(exception) from e 585 | # ensure that extractor logging is available 586 | logger.info(f"maco extractor stderr:\n{stderr}") 587 | return loaded 588 | -------------------------------------------------------------------------------- /maco/model/model.py: -------------------------------------------------------------------------------- 1 | """Malware config extractor output model.""" 2 | 3 | from enum import Enum 4 | from typing import Any, Dict, List, Optional, Union 5 | 6 | from pydantic import BaseModel, ConfigDict 7 | 8 | 9 | class ForbidModel(BaseModel): 10 | """We want to forbid extra properties, so that the 'other' field is used instead.""" 11 | 12 | model_config = ConfigDict(extra="forbid", use_enum_values=True) 13 | 14 | 15 | class ConnUsageEnum(str, Enum): 16 | """Purpose of the connection.""" 17 | 18 | c2 = "c2" # issue commands to malware 19 | upload = "upload" # get data out of the network 20 | download = "download" # fetch dynamic config, second stage, etc 21 | propagate = "propagate" # spread through the network 22 | tunnel = "tunnel" # communicate through the network 23 | ransom = "ransom" # payment 24 | decoy = "decoy" # Decoy connections to obfuscate malicious 25 | other = "other" 26 | 27 | 28 | class Encryption(ForbidModel): 29 | """Encryption usage.""" 30 | 31 | class UsageEnum(str, Enum): 32 | """Purpose of the encryption.""" 33 | 34 | config = "config" 35 | communication = "communication" 36 | binary = "binary" 37 | ransom = "ransom" 38 | other = "other" 39 | 40 | algorithm: Optional[str] = None 41 | public_key: Optional[str] = None 42 | key: Optional[str] = None # private key or symmetric key 43 | provider: Optional[str] = None # encryption library used. openssl, homebrew, etc. 44 | 45 | mode: Optional[str] = None # block vs stream 46 | # base 64'd binary data for these details? 47 | # TODO to confirm usage of these different properties 48 | iv: Optional[str] = None # initialisation vector 49 | seed: Optional[str] = None 50 | nonce: Optional[str] = None 51 | constants: List[str] = [] 52 | 53 | usage: Optional[UsageEnum] = None 54 | 55 | 56 | class CategoryEnum(str, Enum): 57 | """Category of the malware.""" 58 | 59 | # Software that shows you extra promotions that you cannot control as you use your PC. 60 | # You wouldn't see the extra ads if you didn't have adware installed. 61 | adware = "adware" 62 | 63 | # Malware related to an Advanced Persistent Threat (APT) group. 64 | apt = "apt" 65 | 66 | # A backdoor Trojan gives malicious users remote control over the infected computer. 67 | # They enable the author to do anything they wish on the infected computer including 68 | # sending, receiving, launching and deleting files, displaying data and rebooting the computer. 69 | # Backdoor Trojans are often used to unite a group of victim computers to form a botnet or 70 | # zombie network that can be used for criminal purposes. 71 | backdoor = "backdoor" 72 | 73 | # Trojan Banker programs are designed to steal your account data for online banking systems, 74 | # e-payment systems and credit or debit cards. 
75 | banker = "banker" 76 | 77 | # A malware variant that modifies the boot sectors of a hard drive, including the Master Boot Record (MBR) 78 | # and Volume Boot Record (VBR). 79 | bootkit = "bootkit" 80 | 81 | # A malicious bot is self-propagating malware designed to infect a host and connect back to a central server 82 | # or servers that act as a command and control (C&C) center for an entire network of compromised devices, 83 | # or botnet. 84 | bot = "bot" 85 | 86 | # A browser hijacker is defined as a form of unwanted software that modifies a web browser's settings without 87 | # the user's permission. The result is the placement of unwanted advertising into the browser, 88 | # and possibly the replacement of an existing home page or search page with the hijacker page. 89 | browser_hijacker = "browser_hijacker" 90 | 91 | # Trojan bruteforcer are trying to brute force website in order to achieve something else 92 | # (EX: Finding WordPress websites with default credentials). 93 | bruteforcer = "bruteforcer" 94 | 95 | # A type of trojan that can use your PC to 'click' on websites or applications. 96 | # They are usually used to make money for a malicious hacker by clicking on online advertisements 97 | # and making it look like the website gets more traffic than it does. 98 | # They can also be used to skew online polls, install programs on your PC, or make unwanted software 99 | # appear more popular than it is. 100 | clickfraud = "clickfraud" 101 | 102 | # Cryptocurrency mining malware. 103 | cryptominer = "cryptominer" 104 | 105 | # These programs conduct DoS (Denial of Service) attacks against a targeted web address. 106 | # By sending multiple requests from your computer and several other infected computers, 107 | # the attack can overwhelm the target address leading to a denial of service. 108 | ddos = "ddos" 109 | 110 | # Trojan Downloaders can download and install new versions of malicious programs in the target system. 111 | downloader = "downloader" 112 | 113 | # These programs are used by hackers in order to install malware or to prevent the detection of malicious programs. 114 | dropper = "dropper" 115 | 116 | # Exploit kits are programs that contain data or code that takes advantage of a vulnerability 117 | # within an application that is running in the target system. 118 | exploitkit = "exploitkit" 119 | 120 | # Trojan FakeAV programs simulate the activity of antivirus software. 121 | # They are designed to extort money in return for the detection and removal of threat, even though the 122 | # threats that they report are actually non-existent. 123 | fakeav = "fakeav" 124 | 125 | # A type of tool that can be used to allow and maintain unauthorized access to your PC. 126 | hacktool = "hacktool" 127 | 128 | # A program that collects your personal information, such as your browsing history, 129 | # and uses it without adequate consent. 130 | infostealer = "infostealer" 131 | 132 | # A keylogger monitors and logs every keystroke it can identify. 133 | # Once installed, the virus either keeps track of all the keys and stores the information locally, 134 | # after which the hacker needs physical access to the computer to retrieve the information, 135 | # or the logs are sent over the internet back to the hacker. 136 | keylogger = "keylogger" 137 | 138 | # A program that loads another application / memory space. 139 | loader = "loader" 140 | 141 | # A type of malware that hides its code and purpose to make it more difficult for 142 | # security software to detect or remove it. 
143 |     obfuscator = "obfuscator"
144 | 
145 |     # Point-of-sale malware is usually a type of malware that is used by cybercriminals to target point of sale (POS)
146 |     # and payment terminals with the intent to obtain credit card and debit card information.
147 |     pos = "pos"
148 | 
149 |     # This type of trojan allows unauthorized parties to use the infected computer as a proxy server
150 |     # to access the Internet anonymously.
151 |     proxy = "proxy"
152 | 
153 |     # A program that can be used by a remote hacker to gain access and control of an infected machine.
154 |     rat = "rat"
155 | 
156 |     # This type of malware can modify data in the target computer so the operating system
157 |     # will stop running correctly or the data is no longer accessible.
158 |     # The criminal will only restore the computer state or data after a ransom is paid to them
159 |     # (mostly using cryptocurrency).
160 |     ransomware = "ransomware"
161 | 
162 |     # A reverse proxy is a server that receives requests from the internet and forwards them to a small set of servers.
163 |     reverse_proxy = "reverse_proxy"
164 | 
165 |     # Rootkits are designed to conceal certain objects or activities in the system.
166 |     # Often their main purpose is to prevent malicious programs being detected
167 |     # in order to extend the period in which programs can run on an infected computer.
168 |     rootkit = "rootkit"
169 | 
170 |     # This type of malware scans the internet / network(s) / system(s) / service(s) to collect information.
171 |     # That information could later be used to perpetrate a cyber attack.
172 |     scanner = "scanner"
173 | 
174 |     # Scareware is a form of malware which uses social engineering to cause shock, anxiety,
175 |     # or the perception of a threat in order to manipulate users into buying unwanted software.
176 |     scareware = "scareware"
177 | 
178 |     # Malware that sends spam.
179 |     spammer = "spammer"
180 | 
181 |     # Generic or unknown trojan.
182 |     trojan = "trojan"
183 | 
184 |     # A generic computer virus.
185 |     virus = "virus"
186 | 
187 |     # A type of malware that destroys data.
188 |     wiper = "wiper"
189 | 
190 |     # A web shell is a script that can be uploaded to a web server to enable remote administration of the machine.
191 |     webshell = "webshell"
192 | 
193 |     # A type of malware that spreads to other PCs.
194 |     worm = "worm"
195 | 
196 | 
197 | class ExtractorModel(ForbidModel):
198 |     r"""Captured config/iocs, unpacked binaries and other malware properties from a robo-analyst.
199 | 
200 |     This model defines common fields for output of a script targeting a specific malware family.
201 |     Usage of this model will allow for easier sharing of scripts between different authors and systems.
202 |     The model will not define fields for all data that can be extracted from a binary, only the most common.
203 |     This is to make it easier for authors to understand and use the model.
204 | 
205 |     This model can have new fields added in the future if they become more common,
206 |     but the intent is to avoid removing or modifying existing fields, for backwards compatibility.
207 | 
208 |     Where data does not fit with the current model, the 'other' field should be used.
209 |     Contents of this field are not defined by the model and verification/normalisation is up to
210 |     the author and whatever systems run the scripts.
211 |     If many decoders define similar data in the 'other' field, that field should be migrated to this model.
212 | 
213 |     The model must be kept relatively flat, with nested lists of dictionaries to be avoided.
214 |     This is to make queries simpler to write in sql, elasticsearch and other storage systems.
215 | 
216 |     Malware and systems that investigate malware can do pretty much anything.
217 |     This model needs to be simple and flexible to make sharing easy.
218 |     Some things should be out of scope for this model.
219 |     Responsibility for these things lies with the authors and systems that use this model.
220 | 
221 |     Out of scope
222 |     * Verifying anything in the 'other' dict, including that it is json-compatible.
223 |         * We don't know anything about the structure
224 |         * checking it is json compatible requires dumping to a json string, which can be slow
225 |     * Connecting specific config items to malware behaviour catalog
226 |         * e.g. "Persistence::Modify Registry" with 'registry' item from model (SYSTEM\ControlSet001\Services\)
227 |         * due to complexity and normalisation difficulties
228 |         * much malware behaviour is not related to specific config items
229 |     * Normalisation/verification of individual properties
230 |         * e.g. lowercase filepaths - some filesystems are case sensitive
231 |         * e.g. checking registry hives match known - not enough SME and too complex for a simple model
232 |         * generally, this quickly becomes complex (validating a fully defined http item)
233 |         * calling systems are probably performing their own validation anyway
234 |     * Requiring specific properties to be set
235 |         * e.g. if http item is defined, requiring hostname to be set
236 |         * Some use cases always seem to exist where a property should not be set
237 |     """
238 | 
239 |     family: Union[str, List[str]]  # family or families of malware that was detected
240 |     version: Optional[str] = None  # version/variant of malware
241 |     category: List[CategoryEnum] = []  # capability/purpose of the malware
242 |     attack: List[str] = []  # mitre att&ck reference ids, e.g. 'T1129'
243 | 
244 |     #
245 |     # simple config properties
246 |     #
247 | 
248 |     # capabilities of the malware enabled/disabled in config
249 |     # note these are probably malware-specific capabilities so no attempt to normalise has been made
250 |     # note - av/sandbox detection should be noted by 'detect_'
251 |     capability_enabled: List[str] = []
252 |     capability_disabled: List[str] = []
253 | 
254 |     campaign_id: List[str] = []  # Server/Campaign Id for malware
255 |     identifier: List[str] = []  # UUID/Identifiers for deployed instance
256 |     decoded_strings: List[str] = []  # decoded strings from within malware
257 |     password: List[str] = []  # Any password extracted from the binary
258 |     mutex: List[str] = []  # mutex to prevent multiple instances
259 |     pipe: List[str] = []  # pipe name used for communication
260 |     sleep_delay: Optional[int] = None  # time to sleep/delay execution (milliseconds)
261 |     # additional time applied to sleep_delay (milliseconds).
262 |     # Jitter implementations can vary but usually it is a value from which a random number is generated and
263 |     # added to/subtracted from the sleep_delay to make behaviour more unpredictable
264 |     sleep_delay_jitter: Optional[int] = None
265 |     inject_exe: List[str] = []  # name of executable to inject into
266 | 
267 |     # configuration or clustering/research data that doesn't fit the other fields
268 |     # * rarely used by decoders or specific to one decoder
269 |     # to prevent key explosion, the keys must not be dynamically generated
270 |     # e.g.
api_imports, api_checksums, num_imports, import_hash + many more 271 | # data stored here must always be JSON-serialisable 272 | other: Dict[str, Any] = {} 273 | 274 | # 275 | # embedded binary data 276 | # 277 | class Binary(ForbidModel): 278 | """Binary data extracted by decoder.""" 279 | 280 | class TypeEnum(str, Enum): 281 | """Type of binary data.""" 282 | 283 | payload = "payload" # contained within the original file 284 | config = "config" # sometimes malware uses json/formatted text for config 285 | other = "other" 286 | 287 | datatype: Optional[TypeEnum] = None # what the binary data is used for 288 | data: bytes # binary data, not json compatible 289 | 290 | # other information for the extracted binary rather than the config 291 | # data stored here must always be JSON-serialisable 292 | # e.g. filename, extension, relationship label 293 | other: Dict[str, Any] = {} 294 | 295 | # convenience for ret.encryption.append(ret.Encryption(*properties)) 296 | # Define as class as only way to allow for this to be accessed and not have pydantic try to parse it. 297 | class Encryption(Encryption): 298 | """Encryption usage.""" 299 | 300 | pass 301 | 302 | encryption: Union[List[Encryption], Encryption, None] = None # encryption information for the binary 303 | 304 | binaries: List[Binary] = [] 305 | 306 | # 307 | # communication protocols 308 | # 309 | class FTP(ForbidModel): 310 | """Usage of FTP connection.""" 311 | 312 | username: Optional[str] = None 313 | password: Optional[str] = None 314 | hostname: Optional[str] = None 315 | port: Optional[int] = None 316 | 317 | path: Optional[str] = None 318 | 319 | usage: Optional[ConnUsageEnum] = None 320 | 321 | ftp: List[FTP] = [] 322 | 323 | class SMTP(ForbidModel): 324 | """Usage of SMTP.""" 325 | 326 | # credentials and location of server 327 | username: Optional[str] = None 328 | password: Optional[str] = None 329 | hostname: Optional[str] = None 330 | port: Optional[int] = None 331 | 332 | mail_to: List[str] = [] # receivers 333 | mail_from: Optional[str] = None # sender 334 | subject: Optional[str] = None 335 | 336 | usage: Optional[ConnUsageEnum] = None 337 | 338 | smtp: List[SMTP] = [] # SMTP server for malware 339 | 340 | class Http(ForbidModel): 341 | """Usage of HTTP connection.""" 342 | 343 | # malware sometimes does weird stuff with uris so we don't want to force 344 | # authors to break the uri into username, hostname, path, etc. 345 | # as we lose that information. 346 | # e.g. extra '?' or '/' when unnecessary. 
347 | # or something that is technically an invalid uri but still works 348 | uri: Optional[str] = None 349 | 350 | # on the other hand we might not have enough info to construct a uri 351 | protocol: Optional[str] = None # http,https 352 | username: Optional[str] = None 353 | password: Optional[str] = None 354 | hostname: Optional[str] = None # (A host/hostname can be an IP, domain or hostname) 355 | port: Optional[int] = None 356 | path: Optional[str] = None 357 | query: Optional[str] = None 358 | fragment: Optional[str] = None 359 | 360 | user_agent: Optional[str] = None # user agent sent by malware 361 | method: Optional[str] = None # get put delete etc 362 | headers: Optional[Dict[str, str]] = None # custom/additional HTTP headers 363 | max_size: Optional[int] = None 364 | 365 | usage: Optional[ConnUsageEnum] = None 366 | 367 | http: List[Http] = [] 368 | 369 | class SSH(ForbidModel): 370 | """Usage of ssh connection.""" 371 | 372 | username: Optional[str] = None 373 | password: Optional[str] = None 374 | hostname: Optional[str] = None 375 | port: Optional[int] = None 376 | 377 | usage: Optional[ConnUsageEnum] = None 378 | 379 | ssh: List[SSH] = [] 380 | 381 | class Proxy(ForbidModel): 382 | """Usage of proxy connection.""" 383 | 384 | protocol: Optional[str] = None # socks5,http 385 | username: Optional[str] = None 386 | password: Optional[str] = None 387 | hostname: Optional[str] = None 388 | port: Optional[int] = None 389 | 390 | usage: Optional[ConnUsageEnum] = None 391 | 392 | proxy: List[Proxy] = [] 393 | 394 | class ICMP(ForbidModel): 395 | """Usage of ICMP.""" 396 | 397 | type: Optional[int] = None 398 | code: Optional[int] = None 399 | header: Optional[str] = None # Some malware uses non-standard header fields 400 | hostname: Optional[str] = None 401 | 402 | usage: Optional[ConnUsageEnum] = None 403 | 404 | icmp: List[ICMP] = [] 405 | 406 | # 407 | # inter process communication (IPC) 408 | # 409 | class IPC(ForbidModel): 410 | """Usage of named pipe communications.""" 411 | 412 | # A record stored on disk, or a record synthesized on demand by a file 413 | # server, which can be accessed by multiple processes. 414 | file: Optional[List[str]] = None 415 | # Data sent over a network interface, either to a different process on 416 | # the same computer or to another computer on the network. Stream 417 | # oriented (TCP; data written through a socket requires formatting to 418 | # preserve message boundaries) or more rarely message-oriented (UDP, 419 | # SCTP). 420 | socket: Optional[List[str]] = None 421 | # Similar to an internet socket, but all communication occurs within 422 | # the kernel. Domain sockets use the file system as their address 423 | # space. Processes reference a domain socket as an inode, and multiple 424 | # processes can communicate with one socket. 425 | unix_domain_socket: Optional[List[str]] = None 426 | # A file mapped to RAM and can be modified by changing memory 427 | # addresses directly instead of outputting to a stream. This shares 428 | # the same benefits as a standard file. 429 | memory_mapped_file: Optional[Union[bytes, List[str]]] = None 430 | # A data stream similar to a socket, but which usually preserves 431 | # message boundaries. Typically implemented by the operating system, 432 | # they allow multiple processes to read and write to the message queue 433 | # without being directly connected to each other. 434 | message_queue: Optional[List[str]] = None 435 | # A unidirectional data channel using standard input and output. 
Data 436 | # written to the write-end of the pipe is buffered by the operating 437 | # system until it is read from the read-end of the pipe. Two-way 438 | # communication between processes can be achieved by using two pipes 439 | # in opposite "directions". 440 | anonymous_pipe: Optional[List[str]] = None 441 | # A pipe that is treated like a file. Instead of using standard input 442 | # and output as with an anonymous pipe, processes write to and read 443 | # from a named pipe, as if it were a regular file. 444 | named_pipe: Optional[List[str]] = None 445 | # The process names involved in the IPC communication 446 | process_names: Optional[List[str]] = None 447 | # Multiple processes are given access to the same block of memory, 448 | # which creates a shared buffer for the processes to communicate with 449 | # each other. 450 | shared_memory: Optional[bytes] = None 451 | usage: Optional[ConnUsageEnum] = None 452 | 453 | ipc: List[IPC] = [] # Inter-Process Communications (similar to 'pipe' but more detailed) 454 | 455 | class DNS(ForbidModel): 456 | """Direct usage of DNS.""" 457 | 458 | class RecordTypeEnum(str, Enum): 459 | """DNS record types.""" 460 | 461 | A = "A" 462 | AAAA = "AAAA" 463 | AFSDB = "AFSDB" 464 | APL = "APL" 465 | CAA = "CAA" 466 | CDNSKEY = "CDNSKEY" 467 | CDS = "CDS" 468 | CERT = "CERT" 469 | CNAME = "CNAME" 470 | CSYNC = "CSYNC" 471 | DHCID = "DHCID" 472 | DLV = "DLV" 473 | DNAME = "DNAME" 474 | DNSKEY = "DNSKEY" 475 | DS = "DS" 476 | EUI48 = "EUI48" 477 | EUI64 = "EUI64" 478 | HINFO = "HINFO" 479 | HIP = "HIP" 480 | HTTPS = "HTTPS" 481 | IPSECKEY = "IPSECKEY" 482 | KEY = "KEY" 483 | KX = "KX" 484 | LOC = "LOC" 485 | MX = "MX" 486 | NAPTR = "NAPTR" 487 | NS = "NS" 488 | NSEC = "NSEC" 489 | NSEC3 = "NSEC3" 490 | NSEC3PARAM = "NSEC3PARAM" 491 | OPENPGPKEY = "OPENPGPKEY" 492 | PTR = "PTR" 493 | RRSIG = "RRSIG" 494 | RP = "RP" 495 | SIG = "SIG" 496 | SMIMEA = "SMIMEA" 497 | SOA = "SOA" 498 | SRV = "SRV" 499 | SSHFP = "SSHFP" 500 | SVCB = "SVCB" 501 | TA = "TA" 502 | TKEY = "TKEY" 503 | TLSA = "TLSA" 504 | TSIG = "TSIG" 505 | TXT = "TXT" 506 | URI = "URI" 507 | ZONEMD = "ZONEMD" 508 | 509 | ip: Optional[str] = None 510 | port: Optional[int] = None # The default value is 53 511 | hostname: Optional[str] = None # This is the query hostname 512 | record_type: Optional[RecordTypeEnum] = None # The DNS record type that is queried 513 | usage: Optional[ConnUsageEnum] = None 514 | 515 | dns: List[DNS] = [] # custom DNS address to use for name resolution 516 | 517 | class Connection(ForbidModel): 518 | """Generic TCP/UDP usage.""" 519 | 520 | client_ip: Optional[str] = None 521 | client_port: Optional[int] = None 522 | server_ip: Optional[str] = None 523 | server_domain: Optional[str] = None 524 | server_port: Optional[int] = None 525 | 526 | usage: Optional[ConnUsageEnum] = None 527 | 528 | tcp: List[Connection] = [] 529 | udp: List[Connection] = [] 530 | 531 | # 532 | # complex configuration properties 533 | # 534 | # convenience for ret.encryption.append(ret.Encryption(*properties)) 535 | # Define as class as only way to allow for this to be accessed and not have pydantic try to parse it. 
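    # e.g. ret.encryption.append(ret.Encryption(algorithm="rc4", usage="config")) - values are illustrative only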
536 | class Encryption(Encryption): 537 | """Encryption usage.""" 538 | 539 | pass 540 | 541 | encryption: List[Encryption] = [] 542 | 543 | class Service(ForbidModel): 544 | """OS service usage by malware.""" 545 | 546 | dll: Optional[str] = None # dll that the service is loaded from 547 | name: Optional[str] = None # service/driver name for persistence 548 | display_name: Optional[str] = None # display name for service 549 | description: Optional[str] = None # description for service 550 | 551 | service: List[Service] = [] 552 | 553 | class Cryptocurrency(ForbidModel): 554 | """Cryptocoin usage (ransomware/miner).""" 555 | 556 | class UsageEnum(str, Enum): 557 | """Cryptocoin usage.""" 558 | 559 | ransomware = "ransomware" # request money to unlock 560 | miner = "miner" # use gpu/cpu to mint coins 561 | other = "other" 562 | 563 | coin: Optional[str] = None # BTC,ETH,USDT,BNB, etc 564 | address: Optional[str] = None 565 | ransom_amount: Optional[float] = None # number of coins required (if hardcoded) 566 | 567 | usage: UsageEnum 568 | 569 | cryptocurrency: List[Cryptocurrency] = [] 570 | 571 | class Path(ForbidModel): 572 | """Path used by malware.""" 573 | 574 | class UsageEnum(str, Enum): 575 | """Purpose of the path.""" 576 | 577 | c2 = "c2" # file/folder issues commands to malware 578 | config = "config" # config is loaded from this path 579 | install = "install" # install directory/filename for malware 580 | plugins = "plugins" # load new capability from this directory 581 | logs = "logs" # location to log activity 582 | storage = "storage" # location to store/backup copied files 583 | other = "other" 584 | 585 | # C:\User\tmp\whatever.txt or /some/unix/folder/path 586 | path: str 587 | usage: Optional[UsageEnum] = None 588 | 589 | paths: List[Path] = [] # files/directories used by malware 590 | 591 | class Registry(ForbidModel): 592 | """Registry usage by malware.""" 593 | 594 | class UsageEnum(str, Enum): 595 | """Registry usage.""" 596 | 597 | persistence = "persistence" # stay alive 598 | store_data = "store_data" # generated encryption keys or config 599 | store_payload = "store_payload" # malware hidden in registry key 600 | read = "read" # read system registry keys 601 | other = "other" 602 | 603 | key: str 604 | usage: Optional[UsageEnum] = None 605 | 606 | registry: List[Registry] = [] 607 | --------------------------------------------------------------------------------
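To make the intended usage of the model concrete, here is a minimal sketch (not part of the repository) of an extractor populating an ExtractorModel and serialising it. The family name, URI, key and payload bytes are invented for illustration, and the Base64Encoder simply mirrors the encoder embedded in maco/utils.py's VENV_SCRIPT so that bytes fields survive json.dumps; it assumes pydantic 2, as pinned in requirements.txt.

import json
from base64 import b64encode

from maco.model import ExtractorModel


class Base64Encoder(json.JSONEncoder):
    """Mirror of the encoder used by maco/utils.py so bytes fields stay JSON-serialisable."""

    def default(self, o):
        if isinstance(o, bytes):
            return dict(__class__="bytes", data=b64encode(o).decode())
        return json.JSONEncoder.default(self, o)


# 'family' is the only mandatory field; everything else is optional.
ret = ExtractorModel(family="ExampleFamily", version="1.2", category=["rat"])

# Nested models are exposed on the class for convenience (see the comments in model.py).
ret.http.append(ret.Http(uri="https://c2.example.invalid/gate.php", usage="c2"))
ret.encryption.append(ret.Encryption(algorithm="rc4", key="53cr3t", usage="communication"))
ret.binaries.append(ret.Binary(datatype="payload", data=b"MZ\x90\x00"))
ret.other["build_id"] = "alpha-01"  # anything without a dedicated field goes into 'other'

# Same dump arguments that utils.py uses when collecting results from an extractor.
print(json.dumps(ret.model_dump(exclude_defaults=True, exclude_none=True), cls=Base64Encoder, indent=2))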