├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── attacker ├── FAQ.md ├── README.md ├── __init__.py ├── attacker │ ├── __init__.py │ ├── __main__.py │ ├── attacker.py │ ├── clientbase.py │ ├── ember_model.txt.gz │ └── utils │ │ ├── __init__.py │ │ └── modify.py ├── docs │ ├── API.md │ ├── get_all_sample.md │ ├── get_one_sample.md │ ├── get_one_zip.md │ ├── ml_get_sample.md │ ├── ml_submit_sample.md │ ├── ml_submit_sample_all.md │ └── post_one_zip.md └── requirements.txt └── defender ├── Dockerfile ├── FAQ.md ├── README.md ├── __init__.py ├── defender ├── __init__.py ├── __main__.py ├── apps.py └── models │ ├── __init__.py │ ├── dummy_model.py │ ├── ember_model.py │ └── ember_model.txt.gz ├── docker-requirements.txt ├── requirements.txt └── test ├── __init__.py └── __main__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 10 | 11 | # 2020 Machine Learning Security Evasion Competition 12 | 13 | 20 | 21 | This repository contains code samples for the 2020 Machine Learning Security Evasion Competition. Participants must register at [https://mlsec.io](https://mlsec.io) and accept the terms of service in order to participate. 
22 | 23 | ## Dates 24 | | Challenge | Start Date | End Date | 25 | |-------------------|-----------------------------|--------------------| 26 | | [defender](https://github.com/Azure/2020-machine-learning-security-evasion-competition/tree/master/defender) | Jun 15, 2020 (AoE) | Jul 23, 2020 (AoE) | 27 | | [attacker](https://github.com/Azure/2020-machine-learning-security-evasion-competition/tree/master/attacker) | Aug 6, 2020 (AoE) | Sep 18, 2020 (AoE) | 28 | 29 | *start and end times are Anywhere on Earth (AoE) 30 | 31 | 32 | ## Contents 33 | 34 | Outline the file contents of the repository. It helps users navigate the codebase, build configuration and any related assets. 35 | 36 | | File/folder | Description | 37 | |-------------------|------------------------------------------------| 38 | | `defender` | Sample source code for the defender challenge. | 39 | | `attacker` | Sample source code for the attacker challenge. | 40 | | `README.md` | This README file. | 41 | | `LICENSE` | The license for the sample code. | 42 | | `CODE_OF_CONDUCT.md` | Microsoft's open source code of conduct. | 43 | | `SECURITY.md` | Reporting security issues. | 44 | 45 | 46 | ## Contributing 47 | 48 | This project welcomes contributions and suggestions, during or after the competition. Most contributions require you to agree to a 49 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 50 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 51 | 52 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 53 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 54 | provided by the bot. You will only need to do this once across all repos using our CLA. 55 | 56 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 57 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 58 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 59 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 
14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /attacker/FAQ.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | 4 | * [How does the sample `attacker` solution work?](#how-does-the-sample-`attacker`-solution-work?) 5 | * [Can I use adversarial ML toolboxes?](#can-i-use-adversarial-ml-toolboxes?) 6 | 7 | 11 | 12 | 13 | 14 | ## How does the sample `attacker` solution work? 15 | The [example solution](attacker/__main__.py) contains code that produces evasive variants. However, it is possible that 16 | 1. some of the samples produced may not be functional afterall, or 17 | 2. no evasive variant is discovered for a set of samples. 18 | 19 | Thus, manual inspection and manipulation of samples may be required. 20 | 21 | The [HyperOptAttacker class](attacker/attacker.py#L44) in the example code uses the following strategies: 22 | * [Sequential model-based global optimization](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) explores a space of file modifications that decreases the model score, keeping a history of successful and failed strategies via a modeled surrogate of the objective that's updated over time. 23 | * [Functionality-preserving file modifications](attacker/utils/modify.py) should (mostly) preserve the functionality of the files. The following parts of a file can be modified with the example code: 24 | - add new sections with benign content 25 | - add additional (unused) imports 26 | - append data to the file (the overlay) 27 | - modify the timestamp 28 | * In a generalization of a "happy-strings" attack, content is only _added_ to a file. 
The content to add is scraped from a set of benign files that the user specifies. 29 | 30 | For the optimization, we use the [Tree of Parzen estimators (TPE)](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm built into [hyperopt](https://github.com/hyperopt/hyperopt). To make the problem a tractable optimization problem, we employed the following strategies: 31 | * Parameterize the file modifications. 32 | - For sections and imports, the optimizer may choose _how many_ sections / imports to add. 33 | - For sections, imports, and overlays, the optimizer may choose _which_ benign source to copy from (parameterized by an index) and _how many_ bytes to copy to the target (parameterized by a percentage). 34 | - A new timestamp may be selected between the minimum and maximum timestamp observed in the benign set (parameterized by a percentage). 35 | * Derive _scores_ from the hard-labeled model outputs to guide the optimization. 36 | - We average the output of hard-label models. 37 | - Additionally, we include in the average the _score_ of a barebones [ember](https://github.com/endgameinc/ember) model, so that the objective function outputs a number between zero and one. 38 | 39 | 40 | ## Can I use adversarial ML toolboxes? 41 | Short answer: Maybe! Black-box approaches are the appropriate, but be aware that many were designed for images, with norm constraints on the input that may not be suitable for malware. 42 | 43 | The [Adversarial Robustness Toolbox](https://github.com/Trusted-AI/adversarial-robustness-toolbox) includes a number of [black-box attacks](https://github.com/Trusted-AI/adversarial-robustness-toolbox/wiki/ART-Attacks#12-black-box). The implementation and constraints in these attacks are primarily tailored for attacking image models. 44 | 45 | [SecML](https://gitlab.com/secml/secml) includes attacks and corresponding defenses for several methods under both white-box and black-box threat models. 46 | 47 | [CleverHans](https://github.com/tensorflow/cleverhans) focuses primarily on white-box generation of adversarial examples for images, but may provide some guidance in your attack. -------------------------------------------------------------------------------- /attacker/README.md: -------------------------------------------------------------------------------- 1 | # Attacker Challenge 2 | 3 | * [Overview](#overview) 4 | * [Challenge Dates](#challenge-dates) 5 | * [Rules / Terms](#rules-/-terms) 6 | * [Requirements](#requirements) 7 | * [Sample solution](#sample-solution) 8 | * [Resources](#resources) 9 | 10 | 14 | 15 | 16 | 17 | ## Overview 18 | 19 | ### Challenge Dates 20 | Aug 6 - Sep 18, 2020 (AoE) 21 | 22 | ### Rules / Terms 23 | [https://mlsec.io/tos](https://mlsec.io/tos) 24 | 25 | ### Requirements 26 | A valid submission for the attacker challenge consists of the following: 27 | 1. a ZIP file containing modified malware samples with their original names (`001`, `002`, etc.) 28 | 2. samples in the ZIP file have been verified as functional in a [Windows 10 Sandbox (disable networking!)](https://developer.microsoft.com/en-us/microsoft-edge/tools/vms/) 29 | 30 | ## Sample solution 31 | **Only run this sample solution on a Linux virtual machine. It will write novel, functional malware samples to disk.** 32 | 33 | The example solution is intended to simplify creating evasive malware samples that are functional using a semi-automated process. After running this solution, it is possible that 34 | 1. 
some of the samples produced may not be functional afterall, or 35 | 2. no evasive variant is discovered for a set of samples. 36 | Thus, manually verifying that samples are functional, and manually manipulating some samples to evade machine learning may still be required. 37 | 38 | A sample solution that you may modify is included in the [attacker](attacker/) folder. (See the [FAQ](FAQ.md#the-example-solution) for an overview of the example solution.) 39 | 40 | **Install UPX** 41 | 42 | Download the latest release [here](https://github.com/upx/upx/releases/tag/v3.96), required for this sample solution. 43 | 44 | **Install Python requirements using `pip`** 45 | 46 | ``` 47 | pip install -r attacker/requirements.txt 48 | ``` 49 | 50 | **Initialize the attack** 51 | 52 | In the example attack, PE file content is extracted from a list of benign files that you provide. You must also provide the `api_token` obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/). From the root folder, run 53 | ``` 54 | python -m attacker.attacker init --benign ~/data/benign/ --api_token 0123456789abcdef0123456789abcdef -o config.pkl 55 | ``` 56 | 57 | **Offline attack: Discover evasive candidates** 58 | 59 | The sample solution first attacks a _local_ black-box model that you must run. We will use the defended ember model, which is identical to the `ember` model hosted for the competition. Since the black-box models are report hard labels, as a heuristic, we'll average this score with a local version of (undefended) ember which reports a score. This will help our optimization approach discover which file modifications might be fruitful, even if they do not result in a benign label. 60 | 61 | Run the defended ember model in a separate terminal. (For more information, see the [defender documentation](../defender/README.md)): 62 | ``` 63 | pip install -r defender/requirements.txt 64 | cd defender 65 | python -m defender 66 | ``` 67 | 68 | Run the attack script against the model that is now being served locally, storing the samples in a new folder, `pass1`. Those that bypass the local model will be stored in `pass1/success`, while those that do not will be stored in `pass1/failure`. 69 | ``` 70 | python -m attacker.attacker run --config config.pkl --samples ~/data/MLSEC_samples.zip --success_out pass1/success --failure_out pass1/failure --max-evals 10 71 | ``` 72 | 73 | We have allowed only 10 queries per malware sample via `--max-evals 10`. 
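As a point of reference, the scoring heuristic described above can be sketched in a few lines of Python (an illustration only, mirroring what the sample code's `EmberGuidedBlackBox` does in offline mode; it assumes you are in the repository root, the defended model is still being served on port 8080, and the paths match the example commands):
```
import numpy as np
import requests
from defender.defender.models.ember_model import EmberModel

# continuous score in [0, 1] from the undefended ember model
ember = EmberModel('defender/defender/models/ember_model.txt.gz')

def local_loss(bytez: bytes) -> float:
    # hard label (0 or 1) from the defended model served by `python -m defender`
    label = requests.post('http://127.0.0.1:8080/', data=bytez,
                          headers={'Content-Type': 'application/octet-stream'}).json()['result']
    # averaging with a real-valued score lets the optimizer make progress
    # even when the defender's hard label does not change
    return float(np.mean([label, ember.predict_proba(bytez)]))
```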
To continue exploring the space of file modifications for a universal bypass, one may optionally _resume_ optimizing via 74 | ``` 75 | python -m attacker.attacker run --config config.pkl --samples pass1/failure --success_out pass2/success --failure_out pass2/failure --max-evals 10 76 | ``` 77 | 78 | **Online attack: Discover evasive candidates** 79 | After having collected a number of samples that evade the offline `defender` module, use them as seeds in an online attack by include the flag `--online`: 80 | 81 | ``` 82 | python -m attacker.attacker run --config config.pkl --samples candidates/ --success_out online_pass1/success --failure_out online_pass1/failure --max-evals 10 --online 83 | ``` 84 | 85 | As above, this process can be repeated for failed samples: 86 | 87 | ``` 88 | python -m attacker.attacker run --config config.pkl --samples online_pass1/failure --success_out online_pass2/success --failure_out online_pass2/failure --max-evals 10 --online 89 | ``` 90 | 91 | **Manual manipulations and testing** 92 | 93 | One may use this process to collect a subset of samples that evade one more more hosted machine learning models. Additional steps that should be done manually include the following: 94 | 1. additional modifications to samples that do not yet evade all three machine learning models 95 | 2. validate that the generated samples maintain functionality when run in a Windows 10 sandbox 96 | 3. collect completed set into a ZIP file and upload at [https://mlsec.io/zipfile](https://mlsec.io/zipfile) 97 | - only one ZIP upload per 60 minutes is allowed per user 98 | - one should validate that the submitted samples are _still_ evasive--the hosted machine learning models may have changed state since the time of initial discovery 99 | 4. analyze samples that do not pass one or more of the validation steps 100 | 5. 
repeat this process as needed 101 | 102 | 103 | ## Resources 104 | For additional questions, the following resources are available: 105 | * [REST API Interface](docs/API.md) API documentation for submitting samples and uploading ZIP files 106 | * [Frequently Asked Questions](FAQ.md) markdown file with solutions to common problems 107 | * [Join the Slack channel](https://join.slack.com/t/evademalwareml/shared_invite/zt-9birv1qf-KJFEiyLLRVtrsNDuyA0clA) to interact with other contestants 108 | * [Submit an issue](https://github.com/Azure/2020-machine-learning-security-evasion-competition/issues) for issues relating to the sample code -------------------------------------------------------------------------------- /attacker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/attacker/__init__.py -------------------------------------------------------------------------------- /attacker/attacker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/attacker/attacker/__init__.py -------------------------------------------------------------------------------- /attacker/attacker/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import requests 3 | import sys 4 | # import toml 5 | import os 6 | from defender.test import file_bytes_generator, MAXFILESIZE 7 | from .attacker import EmberGuidedBlackBox, HyperOptAttacker 8 | import pickle 9 | import logging 10 | 11 | logging.basicConfig(level=logging.WARNING) 12 | loggers_to_silence = [ 13 | "hyperopt.tpe", 14 | "hyperopt.fmin", 15 | "hyperopt.pyll.base", 16 | "urllib3.connectionpool", 17 | ] 18 | for logger in loggers_to_silence: 19 | logging.getLogger(logger).setLevel(logging.ERROR) 20 | 21 | attack = None 22 | model = None 23 | 24 | 25 | @click.group() 26 | def cli(): 27 | pass 28 | 29 | 30 | @click.command() 31 | @click.option('--benign', required=True, type=str, help='folder containing benign samples') 32 | @click.option('--api_token', required=True, type=str, help='api token') 33 | @click.option('-o', required=True, type=str, help='output pickle file containing configuration data') 34 | def init(benign, api_token, o): 35 | global attack 36 | global model 37 | model = EmberGuidedBlackBox(api_token=api_token, model_gz_path='defender/defender/models/ember_model.txt.gz') 38 | attack = HyperOptAttacker(model, benign) 39 | with open(o, 'wb') as outfile: 40 | pickle.dump({'model': model, 'attack': attack}, outfile) 41 | 42 | 43 | @click.command() 44 | @click.option('--config', required=True, type=str, help='path of config file (python pickle file) created in init step') 45 | @click.option('--samples', required=True, type=str, help='folder or file containing malicious samples') 46 | @click.option('--success_out', required=True, type=str, help='folder to store evasive variants the bypass the models (will create if necessary)') 47 | @click.option('--failure_out', required=True, type=str, help='folder to store samples that only partiall bypass the models (will create if necessary)') 48 | @click.option('--max-evals', type=int, help='maximum queries to allow', default=250) 49 | @click.option('--local-server', type=str, help='URL for local black-box server to 
attack', default='http://127.0.0.1:8080/') 50 | @click.option('--online', is_flag=True, default=False, help='attack local or online model') 51 | def run(config, samples, success_out, failure_out, max_evals, local_server, online): 52 | global attack 53 | global model 54 | with open(config, 'rb') as infile: 55 | dat = pickle.load(infile) 56 | model = dat['model'] 57 | attack = dat['attack'] 58 | print(f'read config from {config}') 59 | 60 | model.online(online) # set online or offline 61 | 62 | model_names = model.model.models if online else ['local'] 63 | 64 | threshold = 1.0 / (len(model_names) + 1) # quit if all blackbox models report label=0 65 | 66 | # create output folder if necessary 67 | os.makedirs(success_out, exist_ok=True) 68 | os.makedirs(failure_out, exist_ok=True) 69 | 70 | for fn, bytez in file_bytes_generator(samples, MAXFILESIZE): 71 | print(fn) 72 | # attempt to restory any history associated with this file 73 | history_fn = fn + '.history.pkl' 74 | if os.path.exists(history_fn): # won't work with original ZIP file 75 | with open(history_fn, 'rb') as infile: 76 | history = pickle.load(infile) 77 | else: 78 | history = None 79 | 80 | preds, newbytez, history = attack.attack(bytez, max_evals=max_evals, history=history, threshold=threshold) 81 | # preds contains [model1, model2, ..., local_ember], where the last is our local heuristic 82 | # newbytez contains a sample with the best-scoring modifications implemented 83 | 84 | bypassed = [p == 0 for p in preds[:-1]] 85 | history['bypassed'] = dict(zip(model_names, bypassed)) 86 | 87 | outfname = os.path.join(success_out if all(bypassed) else failure_out, os.path.basename(fn)) 88 | out_history_fn = outfname + '.history.pkl' 89 | 90 | # write best candidate file 91 | with open(outfname, 'wb') as outfile: 92 | outfile.write(newbytez if all(bypassed) else bytez) # if failed, write the original samples 93 | 94 | # write history, for possible re-use 95 | with open(out_history_fn, 'wb') as outfile: 96 | pickle.dump(history, outfile) 97 | 98 | 99 | cli.add_command(init) 100 | cli.add_command(run) 101 | 102 | if __name__ == '__main__': 103 | cli() -------------------------------------------------------------------------------- /attacker/attacker/attacker.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import lief 3 | from collections import defaultdict, Counter 4 | import numpy as np 5 | from hyperopt import hp, pyll, fmin, tpe, Trials, STATUS_OK, STATUS_FAIL 6 | import pickle 7 | from attacker.attacker.utils.modify import PEFileModifier 8 | from attacker.attacker.clientbase import BlackBoxOfflineClient, BlackBoxOnlineClient 9 | from defender.defender.models.ember_model import EmberModel 10 | import logging 11 | 12 | MAXFILESIZE = 2**21 13 | 14 | 15 | class EmberGuidedBlackBox(object): 16 | def __init__(self, api_token, model_gz_path, online=False): 17 | self.api_token = api_token 18 | self.online(online) 19 | self.ember = EmberModel(model_gz_path) 20 | 21 | def online(self, online=False): 22 | self.is_online = online 23 | if online: 24 | self.model = BlackBoxOnlineClient(self.api_token) 25 | else: 26 | self.model = BlackBoxOfflineClient() 27 | 28 | def predict_models(self, bytez): 29 | # get score from local ember model 30 | try: 31 | score = self.ember.predict_proba(bytez) 32 | except Exception as e: 33 | logging.warning(e) 34 | score = 1.0 35 | 36 | # get predictions from online/offline model(s) 37 | predictions = self.model.predict(bytez) 38 | 39 | all_predictions = 
predictions + [score] 40 | 41 | return all_predictions 42 | 43 | 44 | class HyperOptAttacker(object): 45 | ''' uses hyperopt's Tree Parzen Estimator (TPE) for black-box optimization 46 | of a parameter space that consists of function-preserving file modifications ''' 47 | 48 | def __init__(self, classifier, benign_folder): 49 | assert hasattr(classifier, 'predict_models'), 'expecting "classifier" to have a predict_models method' 50 | self.classifier = classifier 51 | 52 | # initialize from files 53 | self.initialize_benign_content(benign_folder) 54 | 55 | # initialize optimization space 56 | MAX_SECTIONS = 20 57 | MAX_LIBRARIES = 20 58 | section_opts = {f's{s}': hp.choice(f's{s}', [None, {'idx': hp.randint(f's_idx_{s}', len(self.sections)), 59 | 'pct': hp.uniform(f's_pct_{s}', 0, 1)}]) 60 | for s in range(MAX_SECTIONS)} 61 | 62 | import_opts = {f'i{s}': hp.choice(f'i{s}', [None, {'idx': hp.randint(f'i_idx_{s}', len(self.imports)), 63 | 'pct': hp.uniform(f'i_pct_{s}', 0, 1)}]) 64 | for s in range(MAX_LIBRARIES)} 65 | 66 | overlay_opts = hp.choice('overlay_info', [None, {'idx': hp.randint('o_idx', len(self.overlays)), 67 | 'pct': hp.uniform('o_pct', 0, 1)}]) 68 | 69 | self.space = { 70 | 'section_info': section_opts, 71 | 'import_info': import_opts, 72 | 'overlay_info': overlay_opts, 73 | 'modify_timestamp': hp.choice('modify_timestamp', [None, {'pct': hp.uniform('t_pct', 0, 1)}]), 74 | 'upx_unpack': hp.choice('upx_unpack', [False, True]) 75 | } 76 | 77 | def initialize_benign_content(self, benign_folder): 78 | sections = [] 79 | overlays = [] 80 | imports = defaultdict(set) 81 | timestamps = [] 82 | 83 | for fn in glob.glob(f'{benign_folder}/*'): 84 | print(fn) 85 | pe = lief.parse(fn) 86 | if not pe: 87 | continue 88 | for s in pe.sections: 89 | sections.append((s.name, s.characteristics, bytes(s.content))) 90 | for i in pe.imports: 91 | for f in i.entries: 92 | imports[i.name].add(f.name) 93 | timestamps.append(pe.header.time_date_stamps) 94 | overlays.append(bytes(pe.overlay)) 95 | 96 | imports = [(k, list(v)) for k, v in imports.items()] 97 | 98 | # let's sort by content length 99 | sections.sort(key=lambda x: len(x[2]), reverse=True) 100 | overlays.sort(key=lambda x: len(x), reverse=True) 101 | imports.sort(key=lambda x: len(x[1]), reverse=True) 102 | timestamps = [min(timestamps), max(timestamps)] 103 | 104 | # let's filter sections 105 | from collections import Counter 106 | 107 | def updatecounter(k, counter): 108 | counter.update([k]) 109 | return counter[k] 110 | 111 | scounter = Counter() 112 | sections = [s for s in sections if updatecounter(f'{s[0]}{s[1]}', scounter) <= 2] # how many of each name/characteristics? 
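        # keep only benign content that is worth copying: overlays smaller than 1 KiB and
        # import libraries with fewer than five functions are dropped just below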
113 | overlays = [o for o in overlays if len(o) >= 1024] 114 | imports = [i for i in imports if len(i[1]) >= 5] 115 | 116 | self.timestamps = timestamps 117 | self.sections = sections 118 | self.overlays = overlays 119 | self.imports = imports 120 | 121 | def attack(self, bytez, max_evals=250, threshold=0.5, history=None): 122 | if history is None: 123 | history = {'trials': Trials(), 'evals': 0} 124 | 125 | assert 'trials' in history and 'evals' in history, f'expecting "trials" and "evals" in "history" dict' 126 | 127 | def modify(bytez, space): 128 | # using global bytez 129 | modpe = PEFileModifier(bytez) 130 | 131 | # upx packing comes first 132 | if space['upx_unpack']: 133 | modpe.upx_unpack() 134 | 135 | # add some sections 136 | for _, v in space['section_info'].items(): 137 | if v: 138 | name, char, cont = self.sections[v['idx']] 139 | _end = int(v['pct'] * len(cont)) 140 | modpe.add_section(name, char, cont[:_end]) 141 | 142 | # add some imports 143 | for _, v in space['import_info'].items(): 144 | if v: 145 | lib, funcs = self.imports[v['idx']] 146 | _end = int(v['pct'] * len(funcs)) 147 | modpe.add_imports(lib, funcs[:_end]) 148 | 149 | # add to the overlay 150 | if space['overlay_info']: 151 | v = space['overlay_info'] 152 | cont = self.overlays[v['idx']] 153 | _end = int(v['pct'] * len(cont)) 154 | modpe.append_overlay(cont[:_end]) 155 | 156 | # modify timestamp 157 | if space['modify_timestamp']: 158 | pct = space['modify_timestamp']['pct'] 159 | t = int((1 - pct) * self.timestamps[0] + pct * self.timestamps[1]) 160 | modpe.set_timestamp(t) 161 | 162 | # score the function...first check limits 163 | return modpe.content 164 | 165 | # define function to optimize 166 | def f(space): 167 | new_bytez = modify(bytez, space) 168 | 169 | if len(new_bytez) > MAXFILESIZE: 170 | return { 171 | "loss": len(new_bytez) / MAXFILESIZE, # a number larger than 1 172 | "status": STATUS_FAIL, 173 | "space": space 174 | } 175 | 176 | predictions = list(self.classifier.predict_models(new_bytez)) 177 | 178 | return { 179 | "loss": float(np.mean(predictions)), 180 | "pred": predictions, 181 | "status": STATUS_OK, 182 | "space": space 183 | } 184 | 185 | # minimize the function 186 | fmin( 187 | fn=f, 188 | space=self.space, 189 | algo=tpe.suggest, 190 | trials=history['trials'], 191 | max_evals=history['evals'] + max_evals, 192 | loss_threshold=threshold, # terminate early if the loss drops below this value 193 | ) 194 | 195 | # how many iterations were actually taken? 196 | history['evals'] = len(history['trials']) 197 | 198 | # did we actually result in evasion? 
199 | preds = history['trials'].best_trial['result']['pred'] 200 | 201 | space = history['trials'].best_trial['result']['space'] 202 | # generate the file from the space 203 | newbytez = modify(bytez, space) 204 | 205 | # return vector of predictions and the modified file that made them so 206 | return preds, newbytez, history 207 | -------------------------------------------------------------------------------- /attacker/attacker/clientbase.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import requests 3 | import sys 4 | import time 5 | import json 6 | 7 | 8 | MLSEC_SUBMIT_API = 'https://api.mlsec.io/api/ml_submit_sample?api_token={api_token}&model={model}' # POST 9 | MLSEC_RETRIEVE_API = 'https://api.mlsec.io/api/ml_get_sample?api_token={api_token}&jobid={jobid}' # GET 10 | ALL_MODELS = ['ember', 'needforspeed', 'domumpqb'] 11 | SLEEP_DURATION = 1000 12 | 13 | 14 | class BlackBoxOfflineClient(object): 15 | def __init__(self, url='http://127.0.0.1:8080'): 16 | self.url = url 17 | 18 | def predict(self, bytez): 19 | resp = requests.post(self.url, data=bytez, headers={'Content-Type': 'application/octet-stream'}) 20 | json = resp.json() 21 | return [json['result']] 22 | 23 | 24 | class BlackBoxOnlineClient(object): 25 | def __init__(self, api_token, models=ALL_MODELS, post_url=MLSEC_SUBMIT_API, get_url=MLSEC_RETRIEVE_API): 26 | self.api_token = api_token 27 | self.post_url = post_url 28 | self.get_url = get_url 29 | self.models = models 30 | 31 | def predict(self, bytez, models=None, wait_for=SLEEP_DURATION): 32 | if models is None: 33 | models = self.models 34 | for m in models: 35 | assert m in ALL_MODELS, f"invalid model {m} specified" 36 | 37 | mstr = ",".join(models) 38 | resp = requests.post(self.post_url.format(api_token=self.api_token, model=mstr), 39 | data=bytez, 40 | headers={'Content-Type': 'application/octet-stream'}) 41 | assert resp.ok, f'POST error {resp.status_code}: {resp.text}' 42 | 43 | # retrieve jobid 44 | try: 45 | r = resp.json() 46 | jobid = r['jobid'] 47 | except (json.JSONDecodeError, KeyError) as e: 48 | raise(e) 49 | 50 | # retry if not ready 51 | while True: 52 | resp = requests.get(self.get_url.format(api_token=self.api_token, jobid=jobid)) 53 | if resp.status_code != 202: 54 | break 55 | time.sleep(wait_for) 56 | 57 | assert resp.ok, f'GET error {resp.status_code}: {resp.text}' 58 | r = resp.json() 59 | return [r[m]['result'] for m in models] # return list of respones for queried models 60 | 61 | 62 | if __name__ == '__main__': 63 | # first, start a docker image using 64 | # docker run -itp 8080:8080 ember 65 | 66 | model = BlackBoxOfflineClient() 67 | from defender.test import TINYIMPORT 68 | print(model.predict(TINYIMPORT)) 69 | -------------------------------------------------------------------------------- /attacker/attacker/ember_model.txt.gz: -------------------------------------------------------------------------------- 1 | ../../defender/defender/models/ember_model.txt.gz -------------------------------------------------------------------------------- /attacker/attacker/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/attacker/attacker/utils/__init__.py -------------------------------------------------------------------------------- /attacker/attacker/utils/modify.py: 
-------------------------------------------------------------------------------- 1 | import lief 2 | import random 3 | import tempfile 4 | import os 5 | import subprocess 6 | 7 | 8 | class PEFileModifier(object): 9 | def __init__(self, file_content: bytes): 10 | self.bytez = file_content 11 | self.packed_section_names = {'.aspack', '.adata', 'ASPack', '.ASPack', '.boom', '.ccg', '.charmve', 'BitArts', 'DAStub', 12 | '!EPack', '.ecode', '.edata', '.enigma1', '.enigma2', '.enigma2', '.FSG!', '.gentee', 'kkrunchy', 13 | 'lz32.dll', '.mackt', '.MaskPE', 'MEW', '.mnbvcx1', '.mnbvcx2', '.MPRESS1', '.MPRESS2', '.neolite', 14 | '.neolit', '.nsp1', '.nsp0', '.nsp2', 'nps1', 'nps0', 'nps2', '.packed', 'pebundle', 'PEBundle', 15 | 'PEC2TO', 'PECompact2', 'PE2', 'pec', 'pec1', 'pec2', 'pec2', 'pec3', 'pec4', 'pec5', 'pec6', 16 | 'PEC2MO', 'PELOCKnt', '.perplex', 'PESHiELD', '.petite', '.pinclie', 'ProCrypt', '.RLPack', 17 | '.rmnet', 'RCryptor', '.RPCrypt', '.seau', ',sforce3', '.shrink1', '.shrink2', '.shrink3', 18 | '.spack', '.svkp', 'Themida', '.Themida', '.taz', '.tsuarch', '.tsustub', '.packed', 'PEPACK!!', 19 | '.Upack', '.ByDwing', 'UPX0', 'UPX1', 'UPX2', 'UPX3', 'UPX!', '.UPX0', '.UPX1', '.UPX2', 20 | '.vmp0', '.vmp1', '.vmp2', 'VProtect', '.winapi', 'WinLicen', '_winzip_', '.WWPACK', 'WWP32', '.yP', '.y0da'} 21 | 22 | def _build(self, pe, imports=False): 23 | builder = lief.PE.Builder(pe) 24 | if imports: 25 | # patch the original import table in order to redirect functions to the new import table 26 | builder.build_imports(True).patch_imports(True) 27 | builder.build() 28 | return builder.get_build() 29 | 30 | def _ispacked(self, pe): 31 | for s in pe.sections: 32 | if s.name in self.packed_section_names: 33 | return True 34 | return False 35 | 36 | def _section_rename_if_exists(self, pe, section_name, target_name): 37 | for s in pe.sections: 38 | if s.name == section_name: 39 | break 40 | if s.name == section_name: 41 | s.name = target_name 42 | 43 | def add_section(self, section_name: str, characteristics: int, section_content: bytes): 44 | pe = lief.parse(raw=self.bytez) 45 | if self._ispacked(pe): 46 | return # don't mess with sections if the file is packed 47 | replace_name = '.' 
+ ''.join(list(map(chr, [random.randint(ord('a'), ord('z')) for _ in range(6)]))) # example: .nzomcu 48 | self._section_rename_if_exists(pe, section_name, replace_name) # rename if exists 49 | section = lief.PE.Section(name=section_name, content=list(section_content), characteristics=characteristics) 50 | pe.add_section(section, lief.PE.SECTION_TYPES.UNKNOWN) 51 | self.bytez = self._build(pe) 52 | 53 | def rename_section_(self, section_name: str, target_name: str): 54 | pe = lief.parse(raw=self.bytez) 55 | if self._ispacked(pe): 56 | return # don't mess with sections if the file is packed 57 | self._section_rename_if_exists(pe, section_name, target_name) # rename if exists 58 | self.bytez = self._build(pe) # idempotent if the section doesn't exist 59 | 60 | def set_timestamp(self, timestamp: int): 61 | pe = lief.parse(raw=self.bytez) 62 | pe.header.time_date_stamps = timestamp 63 | self.bytez = self._build(pe) 64 | 65 | def append_overlay(self, content: bytes): 66 | self.bytez += content 67 | 68 | def add_imports(self, library, functions): 69 | pe = lief.parse(raw=self.bytez) 70 | lib = pe.add_library(library) 71 | for f in functions: 72 | lib.add_entry(f) 73 | self.bytez = self._build(pe) 74 | 75 | def upx_unpack(self): 76 | # dump to a temporary file 77 | tmpfilename = os.path.join( 78 | tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())) 79 | 80 | with open(tmpfilename, 'wb') as outfile: 81 | outfile.write(self.bytez) 82 | 83 | # test with upx -t 84 | with open(os.devnull, 'w') as DEVNULL: 85 | retcode = subprocess.call( 86 | ['upx', tmpfilename, '-t'], stdout=DEVNULL, stderr=DEVNULL 87 | ) 88 | 89 | if retcode == 0: 90 | with open(os.devnull, 'w') as DEVNULL: 91 | retcode = subprocess.call( 92 | ['upx', tmpfilename, '-d', '-o', tmpfilename + '_unpacked'], stdout=DEVNULL, stderr=DEVNULL 93 | ) 94 | if retcode == 0: 95 | with open(tmpfilename + '_unpacked', 'rb') as result: 96 | self.bytez = result.read() 97 | 98 | os.unlink(tmpfilename) 99 | 100 | return 101 | 102 | @property 103 | def content(self): 104 | return bytes(self.bytez) 105 | 106 | 107 | if __name__ == '__main__': 108 | import glob 109 | import lief 110 | from collections import defaultdict 111 | sections = [] 112 | overlays = [] 113 | imports = defaultdict(set) 114 | timestamps = [] 115 | 116 | for fn in glob.glob('/data/samples/validation/benign/*'): 117 | pe = lief.parse(fn) 118 | for s in pe.sections: 119 | sections.append((s.name, s.characteristics, bytes(s.content))) 120 | for i in pe.imports: 121 | for f in i.entries: 122 | imports[i.name].add(f.name) 123 | timestamps.append(pe.header.time_date_stamps) 124 | overlays.append(bytes(pe.overlay)) 125 | 126 | imports = [(k, list(v)) for k, v in imports.items()] 127 | 128 | # let's sort by content length 129 | sections.sort(key=lambda x: len(x[2]), reverse=True) 130 | overlays.sort(key=lambda x: len(x), reverse=True) 131 | imports.sort(key=lambda x: len(x[1]), reverse=True) 132 | timestamps = [min(timestamps), max(timestamps)] 133 | 134 | # let's filter sections 135 | from collections import Counter 136 | 137 | def updatecounter(k, counter): 138 | counter.update([k]) 139 | return counter[k] 140 | 141 | scounter = Counter() 142 | sections = [s for s in sections if updatecounter(f'{s[0]}{s[1]}', scounter) <= 2] # how many of each name/characteristics? 
143 | 144 | overlays = [o for o in overlays if len(o) >= 1024] 145 | 146 | imports = [i for i in imports if len(i[1]) >= 5] 147 | 148 | # open a file and modify it 149 | malicious_fn = '/data/samples/validation/malicious/002' 150 | bytez = open(malicious_fn, 'rb').read() 151 | 152 | modpe = PEFileModifier(bytez) 153 | modpe.add_section(*sections[0]) 154 | modpe.append_overlay(overlays[0]) 155 | modpe.set_timestamp(timestamps[0]) 156 | modpe.add_imports(*imports[0]) 157 | bytez2 = modpe.content 158 | print(len(bytez), len(bytez2)) 159 | -------------------------------------------------------------------------------- /attacker/docs/API.md: -------------------------------------------------------------------------------- 1 | # REST API 2 | 3 | * [Query Hosted ML models](#query-hosted-ml-models) 4 | * [Upload ZIP files and check on status](#upload-zip-files-and-check-on-status) 5 | 6 | 10 | 11 | 12 | ## Query Hosted ML models 13 | Submit a sample to all hosted ML models, and retrieve a `jobid` 14 | * [ml_submit_sample_all](ml_submit_sample_all.md): `POST https://api.mlsec.io/api/ml_submit_sample_all?api_token={api_token}` 15 | 16 | Submit a sample to one or more specific ML models, and retrieve a `jobid` 17 | * [ml_submit_sample](ml_submit_sample.md): `POST https://api.mlsec.io/api/ml_submit_sample?api_token={api_token}?model={model1,model2}` 18 | 19 | Retrieve resuts from sample submission, referenced by `jobid` 20 | * [ml_get_sample](ml_get_sample.md): `GET https://api.mlsec.io/api/ml_get_sample?api_token={api_token}&jobid={jobid}` 21 | 22 | ## Upload ZIP files and check on status 23 | **Rather than using these API routes, you may submit and view the status of your submission at [https://mlsec.io/zipfile](https://mlsec.io/zipfile/).** 24 | 25 | Upload a ZIP file containing samples 26 | * [post_one_zip](post_one_zip.md): `POST https://api.mlsec.io/api/post_one_zip/new/?url=%2Fzipfile%2F&api_token={api_token}` 27 | 28 | Query specific ZIP status 29 | * [get_one_zip](get_one_zip.md): `GET https://api.mlsec.io/api/get_one_zip/?api_token={api_token}` 30 | 31 | It may take several minutes for the status to show that the ZIP is ready. Each sample must be submitted to each ML model (which counts against your API count on the leaderboard). Those samples that evade all ML models are subsequently detonated in a sandbox to verify functionality of the original sample. 
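These routes can also be driven from Python. The snippet below is a rough translation of the `curl` examples in [post_one_zip](post_one_zip.md) and [get_all_sample](get_all_sample.md) (the API token, label, and ZIP file name are placeholders): it uploads a ZIP and then periodically prints the processing status of each sample.
```
import time
import requests

API = 'https://api.mlsec.io/api'
TOKEN = '0123456789abcdef0123456789abcdef'  # placeholder: use the api_token from https://mlsec.io/myuser/

# upload a ZIP of candidate samples (same form fields as the post_one_zip curl example)
with open('test_mlsc.zip', 'rb') as zf:
    resp = requests.post(f'{API}/post_one_zip/new/',
                         params={'url': '/zipfile/', 'api_token': TOKEN},
                         data={'name': 'my_label'},
                         files={'path': zf})
print(resp.status_code)  # only one ZIP upload is accepted per 60 minutes

# check processing status for all uploaded samples every minute
for _ in range(30):
    samples = requests.get(f'{API}/get_all_sample/', params={'api_token': TOKEN}).json()
    for s in samples:
        print(s['name'], s['ml_processed'], s['uploader_id']['process_status'])
    time.sleep(60)
```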
32 | 33 | Query status of all samples 34 | * [get_all_sample](get_all_sample.md): `GET https://api.mlsec.io/api/get_all_sample/?api_token={api_token}` 35 | 36 | Query status of a specific sample 37 | * [get_one_sample](get_one_sample.md): `GET https://api.mlsec.io/api/get_one_sample/?api_token={api_token}` -------------------------------------------------------------------------------- /attacker/docs/get_all_sample.md: -------------------------------------------------------------------------------- 1 | # get_all_sample 2 | Query the status of uploaded samples 3 | 4 | **METHOD**: `GET` 5 | 6 | **URL**: `https://api.mlsec.io/api/get_all_sample/` 7 | 8 | **PARAMETERS**: 9 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 10 | 11 | ## Successful response 12 | **Code**: `200 OK` 13 | 14 | ## Example 15 | `curl -X GET https://api.mlsec.io/api/get_all_sample/?api_token=0123456789abcdef0123456789abcdef` 16 | 17 | ``` 18 | [ 19 | { 20 | "errors": "ML process error.", 21 | "id": 36, 22 | "ioc": "", 23 | "ioc_processed": "False", 24 | "ml1_bypassed": "False", 25 | "ml2_bypassed": "False", 26 | "ml3_bypassed": "False", 27 | "ml_job_id": "0123456789abcdef", 28 | "ml_processed": "False", 29 | "name": 50, 30 | "sample_score": 0, 31 | "sandbox_equivalent": "False", 32 | "sbx_job_id": 0, 33 | "sbx_sample_id": 0, 34 | "sbx_submission_id": 0, 35 | "sha256": "0123456789abcdef", 36 | "uploader_id": { 37 | "errors": "None", 38 | "id": 1, 39 | "name": "name", 40 | "path": "Jane.Doe_AT_microsoft.com_1596580794.7931645.zip", 41 | "process_status": "processing_sandbox", 42 | "uploader_id": 6327, 43 | "uploadtime": "2020-08-04 22:39" 44 | }, 45 | "valid_new_PE": "True" 46 | } 47 | ] 48 | ``` 49 | 50 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/get_one_sample.md: -------------------------------------------------------------------------------- 1 | # get_one_sample 2 | Query the status of a specific sample 3 | 4 | **METHOD**: `GET` 5 | 6 | **URL**: `https://api.mlsec.io/api/get_one_sample/` 7 | 8 | **PARAMETERS**: 9 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 10 | 11 | ## Successful response 12 | **Code**: `200 OK` 13 | 14 | ## Example 15 | `curl -X GET https://api.mlsec.io/api/get_one_sample/1?api_token=0123456789abcdef0123456789abcdef` 16 | 17 | 18 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/get_one_zip.md: -------------------------------------------------------------------------------- 1 | # get_one_zip 2 | Query the status of ZIP file 3 | 4 | **METHOD**: `GET` 5 | 6 | **URL**: `https://api.mlsec.io/api/get_one_zip/` 7 | 8 | **PARAMETERS**: 9 | * ``: obtained from `r["uploader_id"]["id"]` from [get_all_sample](get_all_sample.md) 10 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 11 | 12 | ## Successful response 13 | **Code**: `200 OK` 14 | 15 | ## Example 16 | `curl -X GET https://api.mlsec.io/api/get_one_zip/1?api_token=0123456789abcdef0123456789abcdef` 17 | 18 | ``` 19 | {"errors":null,"id":1,"name":"name","path":"Jane.Doe_AT_microsoft.com_1596580794.7931645.zip","process_status":"processing_sandbox","uploader_id":6327,"uploadtime":"2020-08-04 22:39"} 20 | ``` 21 | 22 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/ml_get_sample.md: 
-------------------------------------------------------------------------------- 1 | # ml_get_sample 2 | Retrieve resuts from sample submission, referenced by `jobid` 3 | 4 | **METHOD**: `GET` 5 | 6 | **URL**: `https://api.mlsec.io/api/get_ml_sample` 7 | 8 | **PARAMETERS**: 9 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 10 | * `jobid`: `jobid` returned by a call to [ml_submit_sample](ml_submit_sample.md) or [ml_submit_sample_all](ml_submit_sample_all.md) 11 | 12 | ## Successful response 13 | **Code** 14 | * `200 OK`. 15 | * `202` if result is not yet ready 16 | 17 | **Content** 18 | ```json 19 | { 20 | "model1": { 21 | "result": 0 22 | }, 23 | "model2": { 24 | "result": 1 25 | } 26 | } 27 | ``` 28 | 29 | 30 | ## Example 31 | `curl -X GET "https://api.mlsec.io/api/ml_get_sample?api_token=0123456789abcdef0123456789abcdefb&jobid=736330aaa3a4683d3cc866153510763351a60062a236d22b12f4fe0f10853582"` 32 | 33 | ```json 34 | { 35 | "ember": { 36 | "result": 0 37 | } 38 | } 39 | ``` 40 | 41 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/ml_submit_sample.md: -------------------------------------------------------------------------------- 1 | # ml_submit_sample 2 | Submit a sample to one or more specific ML models, and retrieve a `jobid` 3 | 4 | **METHOD**: `POST` 5 | 6 | **URL**: `https://api.mlsec.io/api/ml_submit_sample` 7 | 8 | **PARAMETERS**: 9 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 10 | * `model`: a comma-separated list of model to query, e.g., `model=ember,needforspeed,domumpqb` 11 | 12 | ## Successful response 13 | **Code**: `200 OK` 14 | 15 | **Content** 16 | ```json 17 | { 18 | "jobid": "{jobid}" 19 | } 20 | ``` 21 | 22 | ## Example 23 | `curl -X POST "https://api.mlsec.io/api/ml_submit_sample?api_token=0123456789abcdef0123456789abcdef&model=ember" --data-binary @putty.exe` 24 | 25 | ```json 26 | { 27 | "jobid": "db9616e0254b9187a42711fe2a68b6e29657b5d73aa3394392384b6354132dd1" 28 | } 29 | ``` 30 | 31 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/ml_submit_sample_all.md: -------------------------------------------------------------------------------- 1 | # ml_submit_sample_all 2 | Submit a sample to all hosted ML models, and retrieve a `jobid` 3 | 4 | **METHOD**: `POST` 5 | 6 | **URL**: `https://api.mlsec.io/api/ml_submit_sample_all` 7 | 8 | **PARAMETERS**: 9 | * `api_token`, obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/). 10 | 11 | ## Successful response 12 | **Code**: `200 OK` 13 | 14 | **Content** 15 | ```json 16 | { 17 | "jobid": "{jobid}" 18 | } 19 | ``` 20 | 21 | ## Example 22 | `curl -X POST https://api.mlsec.io/api/ml_submit_sample_all?api_token=0123456789abcdef0123456789abcdef --data-binary @putty.exe` 23 | 24 | ```json 25 | { 26 | "jobid": "db9616e0254b9187a42711fe2a68b6e29657b5d73aa3394392384b6354132dd1" 27 | } 28 | ``` 29 | 30 | 31 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/docs/post_one_zip.md: -------------------------------------------------------------------------------- 1 | # post_one_zip 2 | Upload a ZIP file containing samples tp be evaluated. Note that only one ZIP file may be uploaded every 60 minutes. The user interface at [https://mlsec.io/zipfile/](https://mlsec.io/zipfile/new/?url=%2Fzipfile%2F) may be used in lieu of this API. 
3 | 4 | **METHOD**: `POST` 5 | 6 | **URL**: `https://api.mlsec.io/api/post_one_zip/new/` 7 | 8 | **PARAMETERS**: 9 | * `api_token`: obtained from [https://mlsec.io/myuser](https://mlsec.io/myuser/) 10 | * `name`: a custom label for your ZIP 11 | * `path`: local path of ZIP file to upload 12 | 13 | ## Successful response 14 | **Code** 15 | * `200 OK` 16 | * Other: note hat only one ZIP file may be uploaded every 60 minutes 17 | 18 | ## Example 19 | `curl -X POST "https://api.mlsec.io/api/post_one_zip/new/?url=%2Fzipfile%2F&api_token=0123456789abcdef0123456789abcdef" --form "name=my_label" --form path=\@test_mlsc.zip` 20 | 21 | This API use using a web form. The HTML output should be ignored. 22 | 23 | [Back to API](API.md) -------------------------------------------------------------------------------- /attacker/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | lightgbm 3 | lief 4 | hyperopt 5 | annoy 6 | click 7 | requests 8 | -e git+https://github.com/endgameinc/ember.git#egg=ember -------------------------------------------------------------------------------- /defender/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | ############################# 4 | # INSTALL PYTHON DEPENDENCIES 5 | ############################# 6 | 7 | # install git for pip install git+https:// 8 | RUN apt-get -o Acquire::Max-FutureTime=100000 update \ 9 | && apt-get install -y --no-install-recommends build-essential git 10 | 11 | # create a virtual environment 12 | RUN python -m venv /opt/venv 13 | ENV PATH="/opt/venv/bin:$PATH" 14 | 15 | # copy and install python requirements + ember from github 16 | COPY docker-requirements.txt . 17 | RUN pip install --no-cache-dir -r docker-requirements.txt \ 18 | && pip install --no-cache-dir git+https://github.com/endgameinc/ember.git 19 | 20 | ############################# 21 | # REBASE & DEPLOY CODE 22 | ############################# 23 | 24 | # rebase to make a smaller image 25 | FROM python:3.7-slim 26 | 27 | # required libgomp1 for ember 28 | RUN apt-get -o Acquire::Max-FutureTime=100000 update \ 29 | && apt-get -y --no-install-recommends install \ 30 | libgomp1 \ 31 | && rm -rf /var/lib/apt/lists/* 32 | 33 | # copy python virtual env (all dependencies) from previous image 34 | COPY --from=0 /opt/venv /opt/venv 35 | 36 | # copy defender code to /opt/defender/defender 37 | COPY defender /opt/defender/defender 38 | 39 | ############################# 40 | # SETUP ENVIRONMENT 41 | ############################# 42 | 43 | # open port 8080 44 | EXPOSE 8080 45 | 46 | # add a defender user and switch user 47 | RUN groupadd -r defender && useradd --no-log-init -r -g defender defender 48 | USER defender 49 | 50 | # change working directory 51 | WORKDIR /opt/defender/ 52 | 53 | # update environmental variables 54 | ENV PATH="/opt/venv/bin:$PATH" 55 | ENV PYTHONPATH="/opt/defender" 56 | 57 | # one may tune model file / threshold / name via environmental variables 58 | ENV DF_MODEL_GZ_PATH models/ember_model.txt.gz 59 | ENV DF_MODEL_THRESH 0.8336 60 | ENV DF_MODEL_NAME ember 61 | ENV DF_MODEL_BALL_THRESH 0.25 62 | ENV DF_MODEL_HISTORY 10000 63 | 64 | ############################# 65 | # RUN CODE 66 | ############################# 67 | CMD ["python","-m","defender"] 68 | 69 | ## TO BUILD IMAGE: 70 | # docker build -t ember . 
71 | ## TO RUN IMAGE (ENVIRONMENTAL VARIABLES DECLARED ABOVE) 72 | # docker run -itp 8080:8080 ember 73 | ## TO RUN IMAGE (OVERRIDE ENVIRONMENTAL VARIABLES DECLARED ABOVE) 74 | # docker run -itp 8080:8080 --env DF_MODEL_GZ_PATH="models/ember_model.txt.gz" --env DF_MODEL_THRESH=0.8336 --env DF_MODEL_NAME=myember ember 75 | -------------------------------------------------------------------------------- /defender/FAQ.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | 4 | * [The example solution](#the-example-solution) 5 | * [What does the example solution do?](#what-does-the-example-solution-do?) 6 | * [Data sources](#data-sources) 7 | * [Where do I obtain training samples for my solution?](#where-do-i-obtain-training-samples-for-my-solution?) 8 | * [Where do I obtain samples to test or validate my solution?](#where-do-i-obtain-samples-to-test-or-validate-my-solution?) 9 | * [Building a solution](#building-a-solution) 10 | * [The example DockerFile fails to build with `Release file for http://security.debian.org/debian-security/dists/buster/updates/InRelease is not valid yet`](#the-example-dockerfile-fails-to-build-with-`release-file-for-http://security.debian.org/debian-security/dists/buster/updates/inrelease-is-not-valid-yet`) 11 | * [How can I get my Docker image under 1 GB?](#how-can-i-get-my-docker-image-under-1-gb?) 12 | * [Uploading and validating a solution](#uploading-and-validating-a-solution) 13 | * [Why was my solution rejected on upload?](#why-was-my-solution-rejected-on-upload?) 14 | * [What are the specs of the hosted docker container?](#what-are-the-specs-of-the-hosted-docker-container?) 15 | 16 | 20 | 21 | 22 | 23 | ## The example solution 24 | 25 | ### What does the example solution do? 26 | The sample solution consists of the [EMBER model](https://github.com/endgameinc/ember), wrapped inside a stateful nearest-neighbor detector inspired by the method presented by [(Chen, et al. 2019)](https://arxiv.org/abs/1907.05587). A sample that the EMBER model scores as benign, but has a malicious nearest neighbor in the query history is considered an adversarial evasion attempt. Since common evasion attacks include adding new sections, appending to sections, or appending to the overlay, in this implementation, nearest neighbors are computed with respect to byte-level features (histogram and byte-entropy features) derived from "stripped down" versions of the submitted binary. Each submitted binary is reduced to (up to) the first five sections, only (up to) the first 64k of each section is retained, and only (up to) the first 128 bytes of the overlay are retained. The nearest neighbor radius was set to achieve a small FP rate on binaries in `C:\Windows\System32\`, while still detecting a large fraction of evasive variants submitted in the 2019 competition. 27 | 28 | The sample solution has not been extensively tuned. 29 | 30 | ## Data sources 31 | 32 | ### Where do I obtain training samples for my solution? 33 | For convenience, it is recommended that you modify the example solution based on the [EMBER model](https://github.com/endgameinc/ember), for which [pre-computed features may be downloaded](https://github.com/endgameinc/ember#download) for benign and malicious files. This circumvents legal restrictions around sharing copyrighted benign files, especially. 34 | 35 | Should you wish to train your own model from scratch, you are responsible for curating your own dataset. 
Unfortunately, we are unable to provide large collections of benign or malicious samples at this time.
36 | 
37 | ### Where do I obtain samples to test or validate my solution?
38 | Malicious samples and evasive variants from the 2019 competition (`MLSEC_2019_samples_and_variants.zip`) may be downloaded from [https://mlsec.io/](https://mlsec.io) after registering or logging in. **You are not required to unzip the archive to test malicious samples, and it is strongly recommended that you do not.**
39 | 
40 | View the README file contained in `MLSEC_2019_samples_and_variants.zip` to understand how the contents are organized. Do this without extracting the full contents via
41 | ```
42 | unzip -P infected -p MLSEC_2019_samples_and_variants.zip MLSEC_2019_samples_and_variants/README | less
43 | ```
44 | 
45 | ## Building a solution
46 | 
47 | ### The example DockerFile fails to build with `Release file for http://security.debian.org/debian-security/dists/buster/updates/InRelease is not valid yet`
48 | This is usually caused by the Docker container clock being out of sync with the host clock. Things you can try to fix this include:
49 | * Restart the Docker (Docker Desktop on Windows) service.
50 | * If this doesn't work, modify every instance of `apt-get update` in your Dockerfile to include `-o Acquire::Max-FutureTime=86400`, specifying enough time (in seconds) to make the request valid; 86400 seconds is one day.
51 | 
52 | ### How can I get my Docker image under 1 GB?
53 | It is strongly recommended that you lightly modify the example Dockerfile rather than writing your own. Use
54 | ```
55 | docker system df -v
56 | ```
57 | to view the size of your docker image after you've built it. If your image exceeds 1 GB, check to make sure you're practicing good Dockerfile hygiene.
58 | 
59 | **We've found that the biggest waste of space is unused Python packages. Be selective in what you `pip install`.**
60 | 
61 | A number of additional tricks that may help you reduce the size of your image are implemented in the example Dockerfile:
62 | * Use multiple commands per `RUN`. Each `RUN` creates an additional layer, which adds to the image size, so `RUN do_thing1 && do_thing2 && do_thing3` is more space-efficient than three separate `RUN` commands.
63 | * Use `--no-cache-dir` when doing `pip install`.
64 | * Start with a small image in `FROM`. Beware of using base images that [may cause Python crashes](https://pythonspeed.com/articles/alpine-docker-python/).
65 | 
66 | ## Uploading and validating a solution
67 | 
68 | ### Why was my solution rejected on upload?
69 | A few things to check:
70 | * Docker images are tar files; we accept `.tar.gz` files. Did you `gzip` your Docker image?
71 | * Is your Docker image (_before gzip_) less than 1 GB?
72 | 
73 | ### What are the specs of the hosted docker container?
74 | Your hosted docker container will have a memory limit of 1.5G and a single CPU. Testing it offline with
75 | ```docker run -itp 8080:8080 --memory=1.5g --cpus=1 mydefender```
76 | should shake out the memory/CPU bugs before you upload.
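If you prefer to script a quick sanity check of the running container, the minimal sketch below POSTs a PE file and prints the verdict and response time. It assumes only the interface documented in the accompanying README (`POST /` with `Content-Type: application/octet-stream`, a JSON `{"result": 0}` or `{"result": 1}` reply, and the 5-second response budget) and uses the `requests` package, the same package the bundled test harness imports. The file path argument is a placeholder for any sample of your choosing.

```
import sys
import time

import requests  # same HTTP client used by the bundled test harness

URL = 'http://127.0.0.1:8080/'           # container port 8080 mapped to the host

with open(sys.argv[1], 'rb') as f:       # path to any PE file (placeholder)
    bytez = f.read()

start = time.time()
resp = requests.post(URL, data=bytez,
                     headers={'Content-Type': 'application/octet-stream'},
                     timeout=5)          # the competition's 5-second budget
elapsed = time.time() - start

print(resp.json(), f'({elapsed:.2f} s)') # expect {'result': 0} or {'result': 1}
```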
77 | 
78 | 
--------------------------------------------------------------------------------
/defender/README.md:
--------------------------------------------------------------------------------
1 | # Defender Challenge
2 | 
3 | * [Overview](#overview)
4 | * [Challenge dates](#challenge-dates)
5 | * [Rules / Terms](#rules-/-terms)
6 | * [Requirements](#requirements)
7 | * [Build the sample solution](#build-the-sample-solution)
8 | * [Modify the sample solution](#modify-the-sample-solution)
9 | * [Frequently Asked Questions](#frequently-asked-questions)
10 | 
11 | 
15 | 
16 | 
17 | ## Overview
18 | 
19 | ### Challenge dates
20 | Jun 15 – Jul 23, 2020, AoE (Anywhere on Earth)
21 | 
22 | ### Rules / Terms
23 | [https://mlsec.io/tos](https://mlsec.io/tos)
24 | 
25 | ### Requirements
26 | A valid submission for the defense track consists of the following:
27 | 1. a Docker image no larger than 1 GB when _uncompressed_ (`gzip` compression required for upload)
28 | 2. listens on port 8080
29 | 3. accepts `POST /` with header `Content-Type: application/octet-stream` and the contents of a PE file in the body
30 | 4. returns `{"result": 0}` for benign files and `{"result": 1}` for malicious files (the JSON verdict is returned with `Content-Type: application/json`)
31 | 5. must exhibit a false positive rate of less than 1% and a false negative rate of less than 10% (checked on upload, during and after the [Attacker Challenge](../attacker/) using randomly-selected files)
32 | 6. for files up to 2**21 bytes (2 MiB), must respond in less than 5 seconds (a timeout results in a benign verdict)
33 | 
34 | ## Build the sample solution
35 | Before you proceed, you must [install Docker Engine](https://docs.docker.com/engine/install/) for your operating system.
36 | 
37 | A sample solution that you may modify is included in the `defender` folder. (See the [FAQ](FAQ.md#the-example-solution) for an overview of the example solution.)
38 | 
39 | Install the Python requirements needed to test the solution:
40 | ```
41 | pip install -r requirements.txt
42 | ```
43 | 
44 | From the `defender` folder that contains the `Dockerfile`, build the solution:
45 | ```
46 | docker build -t ember .
47 | ```
48 | 
49 | Run the docker container:
50 | ```
51 | docker run -itp 8080:8080 ember
52 | ```
53 | (The flag `-p 8080:8080` maps the container's port 8080 to the host's port 8080.)
54 | 
55 | Test the solution on malicious and benign samples of your choosing via:
56 | ```
57 | python -m test -m MLSEC_2019_samples_and_variants.zip -b C:\Windows\System32\
58 | ```
59 | Sample collections may be in a folder, or in an archive of type `zip`, `tar`, `tar.bz2`, `tar.gz` or `tgz`. `MLSEC_2019_samples_and_variants.zip` contains malware and evasive submissions from the 2019 evasion competition and may be downloaded from [https://mlsec.io/](https://mlsec.io) after registering or logging in. **You are not required to unzip the archive to test malicious samples, and it is strongly recommended that you do not.**
60 | 
61 | 
62 | ## Modify the sample solution
63 | A sure way to submit a valid solution is to modify the example Python code and Dockerfile. Do this as follows:
64 | 1. Modify [defender/models/ember_model.py](defender/models/ember_model.py) or create a new model file in [defender/models](defender/models).
65 |    + Your Python class must include a `predict` method that [returns an integer](defender/defender/models/ember_model.py#L30-L32): `0` for benign and `1` for malicious. (The code will appropriately wrap this result in a JSON response.) A minimal sketch of such a class appears after this list.
66 | 2.
In [defender/\_\_main\_\_.py](defender/__main__.py), [import your new model](defender/__main__.py#L5-L6), [instantiate your model](defender/__main__.py#L20-L25), and [include it in your app](defender/__main__.py#L27) via `app = create_app(model)`.
67 |    + Tip: you may choose to [pass some model parameters](defender/__main__.py#L10-L14) (e.g., model file, threshold) via environmental variables so that you can tune these in the Dockerfile (faster builds!) rather than in the Python code.
68 | 3. Make sure to update [docker-requirements.txt](docker-requirements.txt) with any Python dependencies that you `import`ed when writing your code.
69 | 4. Modify the [Dockerfile](Dockerfile) to install any additional binary dependencies.
70 | 5. Build your docker image using `docker build -t mydefender .` from the directory containing the `Dockerfile`. It is recommended that your registered username at [https://mlsec.io](https://mlsec.io) is consistent with the name of your docker image (i.e., change `mydefender` to your username).
71 | 6. Run your docker image using `docker run -itp 8080:8080 --memory=1.5g --cpus=1 mydefender`.
72 |    + Your hosted docker container will have a memory limit of 1.5G and a single CPU.
73 | 7. Test your solution using `python -m defender.test -m MLSEC_2019_samples_and_variants.zip -b C:\Windows\System32\`.
74 |    + Malicious and benign samples may be contained in a folder, a ZIP (possibly encrypted with password `infected`), or a tarball (including `.gz` and `.bz2`).
75 | 8. If your image passes the offline tests (FP/FN rates, etc.; <1% FPR, <10% FNR), you are ready to upload it to the website.
76 |    + Export your docker image via `docker image save -o mydefender.tar mydefender`. Replace `mydefender` with your username.
77 |    + Ensure that your saved image `mydefender.tar` does not exceed 1 GB.
78 |    + [GZIP](https://www.gnu.org/software/gzip/) your tar image via `gzip mydefender.tar` to create `mydefender.tar.gz`.
79 |    + Log in to the [website](https://mlsec.io) and upload `mydefender.tar.gz`.
80 |    + Take a break. Validating the docker image may take some time. Please allow 20 minutes before checking the status of your upload. The web portal will indicate whether your image has passed validation tests.
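To make step 1 concrete, here is a minimal sketch of a custom model class that satisfies the documented interface: a `predict` method that accepts raw PE bytes and returns `0` (benign) or `1` (malicious), plus the `model_info` method used by the service's `GET /model` route in `apps.py`. The class name, file name, and size-based scoring heuristic are placeholders for your own logic, not part of the sample solution.

```
# defender/models/my_model.py  (hypothetical file name)

class MyModel(object):
    def __init__(self, size_thresh: int = 2**20, name: str = 'mymodel'):
        self.size_thresh = size_thresh   # placeholder parameter
        self.__name__ = name

    def predict(self, bytez: bytes) -> int:
        # placeholder logic: replace with real feature extraction and scoring
        return int(len(bytez) >= self.size_thresh)   # 1 = malicious, 0 = benign

    def model_info(self) -> dict:
        # surfaced by the GET /model route
        return {"size_thresh": self.size_thresh, "name": self.__name__}
```

In `defender/__main__.py` you would then import and instantiate `MyModel` and pass it to `create_app(model)` exactly as the example does with `StatefulNNEmberModel`.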
81 | 82 | ## Frequently Asked Questions 83 | For additional questions, the following resources are available: 84 | * [Frequently Asked Questions](FAQ.md) markdown file with solutions to common problems 85 | * [Join the Slack channel](https://join.slack.com/t/evademalwareml/shared_invite/zt-9birv1qf-KJFEiyLLRVtrsNDuyA0clA) to interact with other contestants 86 | * [Submit an issue](https://github.com/Azure/2020-machine-learning-security-evasion-competition/issues) for issues relating to the sample code 87 | -------------------------------------------------------------------------------- /defender/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/defender/__init__.py -------------------------------------------------------------------------------- /defender/defender/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/defender/defender/__init__.py -------------------------------------------------------------------------------- /defender/defender/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import envparse 3 | from defender.apps import create_app 4 | 5 | # CUSTOMIZE: import model to be used 6 | from defender.models.ember_model import StatefulNNEmberModel 7 | 8 | if __name__ == "__main__": 9 | # retrive config values from environment variables 10 | model_gz_path = envparse.env("DF_MODEL_GZ_PATH", cast=str, default="models/ember_model.txt.gz") 11 | model_thresh = envparse.env("DF_MODEL_THRESH", cast=float, default=0.8336) 12 | model_name = envparse.env("DF_MODEL_NAME", cast=str, default="ember") 13 | model_ball_thresh = envparse.env("DF_MODEL_BALL_THRESH", cast=float, default=0.25) 14 | model_max_history = envparse.env("DF_MODEL_HISTORY", cast=int, default=10_000) 15 | 16 | # construct absolute path to ensure the correct model is loaded 17 | if not model_gz_path.startswith(os.sep): 18 | model_gz_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), model_gz_path) 19 | 20 | # CUSTOMIZE: app and model instance 21 | model = StatefulNNEmberModel(model_gz_path, 22 | model_thresh, 23 | model_ball_thresh, 24 | model_max_history, 25 | model_name) 26 | 27 | app = create_app(model) 28 | 29 | import sys 30 | port = int(sys.argv[1]) if len(sys.argv) == 2 else 8080 31 | 32 | from gevent.pywsgi import WSGIServer 33 | http_server = WSGIServer(('', port), app) 34 | http_server.serve_forever() 35 | 36 | # curl -XPOST --data-binary @somePEfile http://127.0.0.1:8080/ -H "Content-Type: application/octet-stream" 37 | -------------------------------------------------------------------------------- /defender/defender/apps.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, request 2 | 3 | 4 | def create_app(model): 5 | app = Flask(__name__) 6 | app.config['model'] = model 7 | 8 | # analyse a sample 9 | @app.route('/', methods=['POST']) 10 | def post(): 11 | # curl -XPOST --data-binary @somePEfile http://127.0.0.1:8080/ -H "Content-Type: application/octet-stream" 12 | if request.headers['Content-Type'] != 'application/octet-stream': 13 | resp = jsonify({'error': 'expecting application/octet-stream'}) 14 | resp.status_code = 400 # 
Bad Request 15 | return resp 16 | 17 | bytez = request.data 18 | 19 | model = app.config['model'] 20 | 21 | # query the model 22 | result = model.predict(bytez) 23 | if not isinstance(result, int) or result not in {0, 1}: 24 | resp = jsonify({'error': 'unexpected model result (not in [0,1])'}) 25 | resp.status_code = 500 # Internal Server Error 26 | return resp 27 | 28 | resp = jsonify({'result': result}) 29 | resp.status_code = 200 30 | return resp 31 | 32 | # get the model info 33 | @app.route('/model', methods=['GET']) 34 | def get_model(): 35 | # curl -XGET http://127.0.0.1:8080/model 36 | resp = jsonify(app.config['model'].model_info()) 37 | resp.status_code = 200 38 | return resp 39 | 40 | return app 41 | -------------------------------------------------------------------------------- /defender/defender/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/defender/defender/models/__init__.py -------------------------------------------------------------------------------- /defender/defender/models/dummy_model.py: -------------------------------------------------------------------------------- 1 | class DummyModel(object): 2 | def __init__(self, thresh: float = 0.1234, name: str = 'dummy'): 3 | self.model = None 4 | self.thresh = thresh 5 | self.__name__ = name 6 | 7 | def predict(self, bytez: bytes) -> int: 8 | return 1 # always predict malware (0 for benignware) 9 | 10 | def model_info(self): 11 | return {"thresh": self.thresh, 12 | "name": self.__name__} 13 | -------------------------------------------------------------------------------- /defender/defender/models/ember_model.py: -------------------------------------------------------------------------------- 1 | from ember import PEFeatureExtractor 2 | import lightgbm as lgb 3 | import numpy as np 4 | import gzip 5 | from annoy import AnnoyIndex # pip install --user annoy 6 | from random import randint 7 | import lief 8 | import logging 9 | 10 | logging.basicConfig(level=logging.DEBUG) 11 | EMBER_MODEL_PATH = 'defender/models/ember_model.txt.gz' 12 | 13 | 14 | class EmberModel(object): 15 | '''Implements predict(self, bytez)''' 16 | def __init__(self, 17 | model_gz_path: str = EMBER_MODEL_PATH, 18 | thresh: float = 0.8336, # resulting in 1% FPR 19 | name: str = 'ember'): 20 | # load lightgbm model 21 | with gzip.open(model_gz_path, 'rb') as f: 22 | model = f.read().decode('ascii') 23 | 24 | self.model_gz_path = model_gz_path 25 | self.model = lgb.Booster(model_str=model) 26 | self.thresh = thresh 27 | self.__name__ = name 28 | self.extractor = PEFeatureExtractor(2) # feature_version=2 29 | 30 | def predict(self, bytez: bytes) -> int: 31 | score = self.predict_proba(bytez) 32 | return int(score > self.thresh) 33 | 34 | def predict_proba(self, bytez: bytes) -> float: 35 | self.features = np.array(self.extractor.feature_vector(bytez), 36 | dtype=np.float32) 37 | return self.model.predict([self.features])[0] 38 | 39 | def model_info(self) -> dict: 40 | return {"model_gz_path": self.model_gz_path, 41 | "thresh": self.thresh, 42 | "name": self.__name__} 43 | 44 | 45 | class TrimPEFile(object): 46 | '''Trim a PE file from excessive sections, imports, overlay. This removes content and 47 | most likely breaks the PE file format. 
But, this doesn't matter to a defender.''' 48 | def __init__(self, 49 | max_sections: int = 5, 50 | max_section_size: int = 2**16, # 64k 51 | max_overlay: int = 128, 52 | ): 53 | self.max_sections = max_sections 54 | self.max_section_size = max_section_size 55 | self.max_overlay = max_overlay 56 | 57 | def trim(self, bytez: bytes) -> bytes: 58 | # this operation may break the input file, but as a defender, we don't care 59 | try: 60 | pe = lief.parse(raw=bytez) 61 | except lief.read_out_of_bound: 62 | return bytez 63 | 64 | if not pe: 65 | return bytez 66 | 67 | # start assembling a new PE file 68 | new = lief.PE.Binary(pe.name, pe.optional_header.magic) # preserve PE32 or PE32+ (64-bit) status 69 | 70 | # copy over the first several sections 71 | for i, s in enumerate(pe.sections): 72 | 73 | if i >= self.max_sections: 74 | break 75 | 76 | if s.name.lower() == '.text': # 77 | typ = lief.PE.SECTION_TYPES.TEXT 78 | elif s.name.lower() == '.data' or s.name.lower() == '.rdata': 79 | typ = lief.PE.SECTION_TYPES.DATA 80 | elif s.name.lower() == '.idata': # import section 81 | typ = lief.PE.SECTION_TYPES.IDATA 82 | elif s.name.lower() == '.edata': # export section 83 | typ = lief.PE.SECTION_TYPES.EXPORT 84 | elif s.name.lower() == '.bss': # uninitialized data 85 | typ = lief.PE.SECTION_TYPES.BSS 86 | elif s.name.lower() == '.rsrc': # resources section 87 | typ = lief.PE.SECTION_TYPES.RESOURCE 88 | elif s.name.lower() == '.reloc': 89 | typ = lief.PE.SECTION_TYPES.RELOCATION 90 | elif s.name.lower() == '.tls': 91 | typ = lief.PE.SECTION_TYPES.TLS_ 92 | else: 93 | typ = lief.PE.SECTION_TYPES.UNKNOWN 94 | s.content = s.content[:self.max_section_size] 95 | s.size = len(s.content) 96 | new.add_section(s, typ) 97 | 98 | # build the new PE file 99 | builder = lief.PE.Builder(new) 100 | builder.build() 101 | 102 | newbytez = builder.get_build() 103 | 104 | if len(newbytez) == 0: 105 | return bytez # failed 106 | 107 | # copy over truncated overlay 108 | overlay = pe.overlay 109 | newbytez += overlay[:self.max_overlay] 110 | 111 | return bytes(newbytez) 112 | 113 | 114 | class StatefulNNEmberModel(EmberModel): 115 | '''Adds stateful nearest-neighbor detection of adversarial examples to base ember model. 116 | If a sample (or a trimmed variant) is deemed benign by EMBER, first check history of queries 117 | for a sufficiently close malicious neighbor, and outputs "malicious" if one is found. Else, benign. 118 | 119 | Note that during the competition, the organizers will submit benign samples to the models, so care must be taken 120 | so that the stateful history doesn't include any benign samples that could result in a high FP rate. 
121 |     '''
122 |     ADV_INDEX_SIZE = 512  # grab the first ADV_INDEX_SIZE features, corresponding to histogram(256) and byteentropy(256)
123 |     # features are described by self.extractor.features:
124 |     # [histogram(256), byteentropy(256), strings(104), general(10),
125 |     # header(62), section(255), imports(1280), exports(128), datadirectories(30)]
126 | 
127 |     def __init__(self,
128 |                  model_gz_path: str = EMBER_MODEL_PATH,
129 |                  thresh: float = 0.8336,  # resulting in 1% FPR
130 |                  ball_thresh: float = 0.25,  # threshold for L1 distance to previously-seen malware
131 |                  max_history: int = 10_000,  # keep up to this much query history
132 |                  name: str = 'defended-ember'):
133 |         super().__init__(model_gz_path, thresh, name)
134 |         self.malicious_queries = []
135 |         self.max_history = max_history
136 |         self.ball_thresh = ball_thresh
137 |         self.trimmer = TrimPEFile()
138 | 
139 |     def predict(self, bytez: bytes) -> int:
140 |         score = self.predict_proba(bytez)
141 |         trimmed_bytez = self.trimmer.trim(bytez)
142 |         trimmed_score = self.predict_proba(trimmed_bytez)
143 |         trimmed_features = self.features
144 | 
145 |         # after predict_proba, self.features contains feature vector for bytez
146 |         # features are described by self.extractor.features:
147 |         # [histogram(256), byteentropy(256), strings(104), general(10),
148 |         # header(62), section(255), imports(1280), exports(128), datadirectories(30)]
149 |         # we'll use only the first 2 categories (512 columns) to index samples.
150 | 
151 |         if score > self.thresh or trimmed_score > self.thresh:
152 |             self.malicious_queries.append((trimmed_features[:self.ADV_INDEX_SIZE], score))
153 |             # if the list is too big, shuffle and trim (keep a random subset)
154 |             while len(self.malicious_queries) > self.max_history:
155 |                 # remove a random element
156 |                 self.malicious_queries.pop(randint(0, len(self.malicious_queries) - 1))
157 | 
158 |         elif len(self.malicious_queries) > 0:
159 |             # is it sufficiently similar to some malicious sample I've seen previously?
160 |             t = AnnoyIndex(self.ADV_INDEX_SIZE, 'manhattan')
161 |             for i, (m, _) in enumerate(self.malicious_queries):
162 |                 t.add_item(i, m)
163 |             t.build(20)
164 | 
165 |             # is the core of this file similar to a malicious file I've seen?
166 |             q = trimmed_features[:self.ADV_INDEX_SIZE]
167 |             nn_ix = t.get_nns_by_vector(q, 10)
168 | 
169 |             dists = [np.linalg.norm(self.malicious_queries[ix][0] - q, 1) for ix in nn_ix]
170 |             min_dist = min(dists)  # how close is the actual nearest neighbor?
171 | 
172 |             if min_dist < self.ball_thresh:
173 |                 logging.info("Detected Adversarial Example!")
174 |                 score = 1
175 | 
176 |         # else
177 |         result = int(score > self.thresh)
178 |         logging.info(f'result: {result}')
179 |         return result
180 | 
181 | 
182 | if __name__ == '__main__':
183 |     # to run this file, from the defender folder, run
184 |     # python -m defender.models.ember_model
185 | 
186 |     # let's determine a threshold for StatefulNNEmberModel.ball_thresh
187 |     # Do this by comparing the NN distance of (2019 evasive variant, base malware sample) pairs
188 |     # to the distance of (benign sample, base malware sample) pairs, across a large set of benign samples
189 |     #
190 |     # In what follows, I've assembled the samples in MLSEC_2019_samples_and_variants.zip into
191 |     # two separate archives:
192 |     # /tmp/MLSEC_samples.zip contains the 2019 base malware samples
193 |     # /tmp/*.zip each zip file contains 2019 evasive variants for a single user batch submission, with entries matching the base malware name
194 |     # For benign samples, I'm using C:\Windows\System32 (Windows 10), which is mounted automatically via WSL2 (https://docs.microsoft.com/en-us/windows/wsl/install-win10) under
195 |     # /mnt/c/windows/system32
196 | 
197 |     from test import file_bytes_generator
198 |     import glob
199 |     from collections import defaultdict
200 |     import os
201 | 
202 |     fe = PEFeatureExtractor(2)
203 |     ADV_INDEX_SIZE = 512  # grab the first ADV_INDEX_SIZE features, corresponding to histogram(256) and byteentropy(256)
204 |     # features are described by self.extractor.features:
205 |     # [histogram(256), byteentropy(256), strings(104), general(10),
206 |     # header(62), section(255), imports(1280), exports(128), datadirectories(30)]
207 | 
208 |     trimmer = TrimPEFile()
209 | 
210 |     print('examining base malware samples...')
211 |     samples = []
212 |     sample_names = []
213 |     for name, bytez in file_bytes_generator('/tmp/MLSEC_samples.zip', maxsize=2**21):
214 |         print(name)
215 |         fv = fe.feature_vector(trimmer.trim(bytez))
216 |         samples.append(fv[:ADV_INDEX_SIZE])
217 |         sample_names.append(os.path.basename(name))
218 | 
219 |     samples = np.array(samples)
220 | 
221 |     print('examining novel malware variants...')
222 |     malware_dist = defaultdict(list)
223 |     for z in glob.glob('/tmp/submissions/*.zip'):
224 |         print(z)
225 |         for name, bytez in file_bytes_generator(z, maxsize=2**21):
226 |             print(name)
227 |             basename = os.path.basename(name)
228 |             m_idx = sample_names.index(basename)
229 |             fv = fe.feature_vector(trimmer.trim(bytez))
230 |             q = np.array(fv[:ADV_INDEX_SIZE])
231 |             normalized_dist = np.sum(np.abs(samples[m_idx, :] - q))  # distance of evasive variant to original
232 |             malware_dist[basename].append((name, normalized_dist))
233 | 
234 |     print('examining benign samples...')
235 |     benign_dist = defaultdict(list)
236 |     for i, (name, bytez) in enumerate(file_bytes_generator('/mnt/c/windows/system32/', maxsize=2**21)):
237 |         if i > 1000:
238 |             break
239 |         print(name)
240 |         fv = fe.feature_vector(trimmer.trim(bytez))
241 |         q = np.array(fv[:ADV_INDEX_SIZE])
242 |         normalized_dist = np.sum(np.abs(samples - q), axis=1)  # distance from benign to every base malware sample
243 |         for d, n in zip(normalized_dist, sample_names):
244 |             benign_dist[n].append((name, d))
245 | 
246 |     # compute distance percentiles from benign/evasive samples to base malware samples
247 |     malware_stats = np.percentile([vv[1] for k, v in malware_dist.items() for vv in v], [10.0, 50.0, 90.0])
248 |     benign_stats = np.percentile([vv[1] for k, v in
benign_dist.items() for vv in v], [0.1, 0.5, 1.0]) 249 | 250 | print('distance percentiles for evasive variants (10%, 50%, 90%):') 251 | print(malware_stats) 252 | print('distance percentiles for benign samples (0.1%, 0.5%, 1%):') 253 | print(benign_stats) 254 | 255 | # distance percentiles for evasive variants (10%, 50%, 90%): 256 | # [2.92558077e-04 1.41343959e-02 2.23442249e+00] 257 | # distance percentiles for benign samples (0.1%, 0.5%, 1%): 258 | # [0.291772 0.40277199 0.49259179] 259 | 260 | # I'll set the StatefulNNEmberModel.ball_thresh = 0.25 261 | # The decision is based purely on low FP rate, to prevent the stateful protection from drifting into a state 262 | # wherein it labels EVERYTHING as an adversarial example. This threshold should also catch a large fraction 263 | # of the 2019 evasive variants, prodived that the stateful detection model first observes the base malware sample. 264 | -------------------------------------------------------------------------------- /defender/defender/models/ember_model.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/2020-machine-learning-security-evasion-competition/bfe1fe9d666d8679e3a4b8797d6e2cba169fd138/defender/defender/models/ember_model.txt.gz -------------------------------------------------------------------------------- /defender/docker-requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install git+https://github.com/endgameinc/ember.git 2 | flask>=1.1.2 3 | gevent>=1.4.0 4 | envparse 5 | annoy 6 | -------------------------------------------------------------------------------- /defender/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install git+https://github.com/endgameinc/ember.git 2 | flask>=1.1.2 3 | gevent>=1.4.0 4 | envparse 5 | annoy 6 | numpy 7 | tqdm 8 | -------------------------------------------------------------------------------- /defender/test/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import tarfile 4 | import zipfile 5 | import pathlib 6 | 7 | import time 8 | import tqdm 9 | import json 10 | 11 | import numpy as np 12 | import os 13 | 14 | MAXFILESIZE = 2**21 # 2 MiB 15 | TIMEOUT = 5 16 | ZIP_PASSWORDS = [b'', b'infected'] 17 | 18 | # TINY PE FILES 19 | MZHEADER = b'MZ' 20 | 21 | TINYPE97 = (b'MZ\x00\x00PE\x00\x00L\x01\x01\x00j*X\xc3\x00\x00\x00\x00\x00\x00\x00\x00' 22 | b'\x04\x00\x03\x01\x0b\x01\x08\x00\x04\x00\x00\x00\x00\x00\x00\x00\x04\x00' 23 | b'\x00\x00\x0c\x00\x00\x00\x04\x00\x00\x00\x0c\x00\x00\x00\x00\x00@\x00\x04' 24 | b'\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00' 25 | b'\x00\x00\x00\x00\x00h\x00\x00\x00d\x00\x00\x00\x00\x00\x00\x00\x02') 26 | 27 | TINYIMPORT = (b'MZ\x00\x00PE\x00\x00L\x01\x01\x00j*X\xc3\x00\x00\x00\x00\x00\x00\x00\x00' 28 | b'\x04\x00\x03\x01\x0b\x01\x08\x00\x98\x00\x00\x00\x00\x00\x00\x00\x95\x00' 29 | b'\x00\x00\x0c\x00\x00\x00\x95\x00\x00\x00\x0c\x00\x00\x00\x00\x00@\x00\x04' 30 | b'\x00\x00\x00\x04\x00\x00\x00\x94\x00\x00\x00\x8c\x00\x00\x00\x04\x00\x00' 31 | b'\x00\x00\x00\x00\x00$\x01\x00\x00\x8c\x00\x00\x00\x00\x00\x00\x00\x02\x00' 32 | b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 33 | b'\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x008\x00\x00' 34 | b'\x00(\x00\x00\x00\x01\x00\x00\x80\x00\x00\x00\x00KERNEL32.dll\x00') 35 | 36 | 
37 | def file_bytes_generator(location, maxsize, return_filename=True):
38 |     # yields (name,bytes) for files smaller than `maxsize` and that begin with b'MZ'
39 |     # works for tar (including bz2, gz), zip, and directories
40 |     path = pathlib.Path(location)
41 |     if path.is_file():
42 |         if location.lower().endswith('.zip'):
43 |             pwd_ix = 0
44 |             with zipfile.ZipFile(location, 'r') as f:
45 |                 for info in f.infolist():
46 |                     if info.file_size <= maxsize:
47 |                         while True:
48 |                             try:
49 |                                 content = f.read(
50 |                                     info.filename, pwd=ZIP_PASSWORDS[pwd_ix])
51 |                             except RuntimeError:
52 |                                 pwd_ix += 1
53 |                                 if pwd_ix >= len(ZIP_PASSWORDS):
54 |                                     raise Exception(
55 |                                         f"Unable to guess ZIP encryption passwords for {location}")
56 |                             else:
57 |                                 break
58 | 
59 |                         if content.startswith(b'MZ'):
60 |                             yield (os.path.join(location, info.filename), content) if return_filename else content
61 | 
62 |         elif location.lower().endswith('.tar') or location.lower().endswith('.tar.bz2') or location.lower().endswith('.tar.gz') or location.lower().endswith('.tgz'):
63 |             with tarfile.open(location, mode='r') as tar:
64 |                 for member in tar:
65 |                     if member.size <= maxsize:
66 |                         f = tar.extractfile(member)
67 |                         if f:
68 |                             content = f.read()
69 |                             if content.startswith(b'MZ'):
70 |                                 yield (os.path.join(location, member.name), content) if return_filename else content
71 | 
72 |     elif path.is_dir():
73 |         for filepath in path.glob('*'):
74 |             fileobj = pathlib.Path(filepath)
75 |             if fileobj.is_file() and fileobj.stat().st_size <= maxsize:
76 |                 try:
77 |                     with open(filepath, 'rb') as infile:
78 |                         content = infile.read()
79 |                         if content.startswith(b'MZ'):
80 |                             yield (fileobj.absolute().name, content) if return_filename else content
81 |                 except PermissionError:
82 |                     continue
83 | 
84 | 
85 | def get_raw_result(bytez, url, timeout):
86 |     return requests.post(url, data=bytez, headers={'Content-Type': 'application/octet-stream'}, timeout=timeout)
87 | 
88 | 
89 | def get_result(bytez, url, timeout, raise_exception=False):
90 |     error_msg = None
91 |     res = None
92 |     start = time.time()
93 |     try:
94 |         res = get_raw_result(bytez, url, timeout)
95 |         result = res.json()['result']
96 |     except (requests.RequestException, KeyError, json.decoder.JSONDecodeError) as e:
97 |         result = 0  # timeout or other error results in benign
98 |         error_msg = str(e)
99 |         if res:
100 |             error_msg += f'-{res.text}'
101 |         if raise_exception:
102 |             raise e
103 | 
104 |     elapsed = time.time() - start
105 | 
106 |     return result, elapsed, error_msg
107 | 
108 | 
109 | def measure_efficacy(benignfiles, maliciousfiles, url, maxfilesize, timeout, silent=False, stop_after=None, raise_exception=False):
110 |     y_true = []
111 |     y_pred = []
112 |     elapsed = []
113 |     error = []
114 |     fps = []
115 |     fns = []
116 |     errors = []
117 | 
118 |     for i, (fname, bytez) in tqdm.tqdm(enumerate(file_bytes_generator(maliciousfiles, maxfilesize)), desc="malicious", disable=silent):
119 |         if stop_after and i >= stop_after:
120 |             break
121 |         y_true.append(1)
122 |         y, t, e = get_result(bytez, url, timeout, raise_exception)
123 |         y_pred.append(y)
124 |         elapsed.append(t)
125 |         error.append(0 if e is None else 1)
126 |         if e:
127 |             errors.append((fname, e))
128 |         if y != 1:
129 |             fns.append(fname)
130 | 
131 |     for i, (fname, bytez) in tqdm.tqdm(enumerate(file_bytes_generator(benignfiles, maxfilesize)), desc="benign", disable=silent):
132 |         if stop_after and i >= stop_after:
133 |             break
134 |         y_true.append(0)
135 |         y, t, e = get_result(bytez, url, timeout, raise_exception)
136 |         y_pred.append(y)
137 |         elapsed.append(t)
138 | 
error.append(0 if e is None else 1)
139 |         if e:
140 |             errors.append((fname, e))
141 |         if y != 0:
142 |             fps.append(fname)
143 | 
144 |     y_true = np.array(y_true)
145 |     y_pred = np.array(y_pred)
146 |     elapsed = np.array(elapsed)
147 |     error = np.array(error)
148 | 
149 |     summary = {
150 |         'tested': len(y_true),
151 |         'malicious': int(np.sum(y_true == 1)),
152 |         'benign': int(np.sum(y_true == 0)),
153 |         'fp': float(len(fps) / np.sum(y_true == 0)),
154 |         'fn': float(len(fns) / np.sum(y_true == 1)),
155 |         'errors': int(sum(error)),  # includes timeouts and other errors
156 |         'max_time': float(elapsed.max()),
157 |         'avg_time': float(np.mean(elapsed))
158 |     }
159 | 
160 |     return summary, fps, fns, errors
161 | 
162 | 
163 | def informational(url, timeout):
164 |     def get_json_string_result_for(bytez):
165 |         res = get_raw_result(bytez, url, timeout)
166 |         return json.dumps(res.json())
167 | 
168 |     result = f'''
169 | Preliminary tests:
170 | \tMZ header: {get_json_string_result_for(MZHEADER)}
171 | \tTiny PE FILE with no imports: {get_json_string_result_for(TINYPE97)}
172 | \tTiny PE FILE with import: {get_json_string_result_for(TINYIMPORT)}'''
173 | 
174 |     return result
175 | 
176 | 
177 | if __name__ == '__main__':
178 |     from collections import defaultdict
179 |     usernum = defaultdict(lambda: len(usernum))
180 |     import zipfile
181 |     import glob
182 |     for zfn in sorted(glob.glob('zip/*.zip')):
183 |         with zipfile.ZipFile(zfn, 'r') as f:
184 |             for info in f.infolist():
185 |                 if info.file_size <= 2**21:
186 |                     content = f.read(info.filename)
187 | 
188 |                     if not content.startswith(b'MZ'):
189 |                         break
190 | 
191 |                     # create a sensible output filename:
192 |                     # 023_u0_ts0
193 |                     base = zfn[:-len('.zip.filtered.zip')]
194 |                     ix = base.rfind('_')
195 |                     ts, num = base[ix + 1:].split('.')
196 |                     user = base[:ix]
197 |                     num = num.zfill(3)
198 | 
199 |                     base = info.filename
200 |                     if base not in {str(n).zfill(3) for n in range(1, 50)}:
201 |                         break
202 | 
203 |                     user = str(usernum[user]).zfill(3)
204 | 
205 |                     outname = f'{base}_u{user}_s{num}'
206 |                     print(f'{zfn}/{info.filename} -> {outname}')
207 |                     with open(outname,'wb') as outf:
208 |                         outf.write(content)
209 | 
--------------------------------------------------------------------------------
/defender/test/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from test import informational, measure_efficacy, MAXFILESIZE, TIMEOUT, TINYIMPORT, get_raw_result
3 | import json
4 | import pathlib
5 | import requests
6 | 
7 | 
8 | def main():
9 |     parser = argparse.ArgumentParser(
10 |         description="Test defense ML docker image", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
11 |     parser.add_argument(
12 |         '-m', required=True, type=str, help='folder or archive (ZIP or TAR.GZ or TAR.BZ2) containing malicious samples')
13 |     parser.add_argument(
14 |         '-b', required=True, type=str, help='folder or archive (ZIP or TAR.GZ or TAR.BZ2) containing benign samples')
15 |     parser.add_argument('--url', type=str, default='http://127.0.0.1:8080/',
16 |                         help='URL address of ML detection service')
17 |     parser.add_argument('--max', type=int, default=MAXFILESIZE,
18 |                         help="maximum file size to read from folder/archive")
19 |     parser.add_argument('--stopafter', type=int, default=5000,
20 |                         help="test up to this many files in each folder/archive")
21 |     parser.add_argument('--timeout', type=int,
22 |                         default=TIMEOUT, help="timeout for requests")
23 |     args = parser.parse_args()
24 | 
25 |     assert pathlib.Path(args.b).is_file() or pathlib.Path(
26 | 
args.b).is_dir(), "benign samples path does not exist" 27 | assert pathlib.Path(args.m).is_file() or pathlib.Path( 28 | args.m).is_dir(), "malicious samples path does not exist" 29 | 30 | # check the format of responses 31 | try: 32 | res = get_raw_result(TINYIMPORT, args.url, args.timeout) 33 | json_result = res.json() 34 | except requests.RequestException: 35 | print(f'Error calling service at {args.url}') 36 | return 37 | except json.decoder.JSONDecodeError: 38 | print(f'Expected json response, but received "{res.text}"') 39 | return 40 | 41 | assert 'result' in json_result, f'Expected JSON with "result" key, but received {json.dumps(json_result)}' 42 | 43 | # check that the response is an integer 44 | assert isinstance(json_result['result'], int), f'Expected an integer response but received {json.dumps(json_result)}' 45 | 46 | # provide information on some benign corner cases 47 | print(informational(args.url, args.timeout)) 48 | 49 | # efficacy test: malicious files are tested first, which helps measure FP rate for stateful defenses 50 | summary, fps, fns, errors = measure_efficacy( 51 | args.b, args.m, args.url, args.max, args.timeout, stop_after=args.stopafter) 52 | 53 | with open('fps.txt', 'w') as outfile: 54 | for fp in fps: 55 | outfile.write(f'{fp}\n') 56 | 57 | with open('fns.txt', 'w') as outfile: 58 | for fn in fns: 59 | outfile.write(f'{fn}\n') 60 | 61 | with open('errors.txt', 'w') as outfile: 62 | for fn, e in errors: 63 | outfile.write(f'{fn}\t{e}\n') 64 | 65 | print('Summary:') 66 | print(json.dumps(summary, indent=2)) 67 | 68 | 69 | if __name__ == '__main__': 70 | main() 71 | --------------------------------------------------------------------------------