├── db ├── .gitkeep └── rl │ └── .gitkeep ├── tests ├── __init__.py └── test_main.py ├── samples ├── mod │ └── .gitkeep ├── npz │ └── .gitkeep ├── uap │ └── .gitkeep ├── malware_set │ └── .gitkeep ├── successful │ ├── .gitkeep │ └── detected │ │ └── .gitkeep ├── unsuccessful │ └── .gitkeep └── rl │ └── evaluation_set │ └── .gitkeep ├── data ├── lgbm_ember.pkl ├── lgbm_sorel.pkl ├── gradient_boosting.pkl ├── section_names.txt ├── manipulate.py └── pefeatures.py ├── codecov.yml ├── src ├── config.py ├── setup.py ├── config.ini ├── plot.py ├── gp.py ├── defense.py ├── functions.py └── rl.py ├── requirements.txt ├── docs ├── Makefile ├── make.bat ├── conf.py └── index.rst ├── .github └── workflows │ └── main.yml ├── Install.md ├── main.py ├── README.md └── License /db/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /db/rl/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/mod/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/npz/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/uap/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/malware_set/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/successful/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/unsuccessful/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/rl/evaluation_set/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/successful/detected/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/lgbm_ember.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/lgbm_ember.pkl -------------------------------------------------------------------------------- /data/lgbm_sorel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/lgbm_sorel.pkl -------------------------------------------------------------------------------- /data/gradient_boosting.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/gradient_boosting.pkl
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 |   require_ci_to_pass: true
3 | 
4 | coverage:
5 |   precision: 2
6 |   round: down
7 |   range: "70...100"
8 | 
9 | parsers:
10 |   gcov:
11 |     branch_detection:
12 |       conditional: yes
13 |       loop: yes
14 |       method: no
15 |       macro: no
16 | 
17 | comment:
18 |   layout: "reach,diff,flags,files,footer"
19 |   behavior: default
20 |   require_changes: false
21 | 
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import configparser
3 | 
4 | 
5 | def load_config(c):
6 |     """
7 |     Load configuration data.
8 |     """
9 | 
10 |     config = ''  # returned as-is if the file cannot be read
11 |     try:
12 |         path = os.path.dirname(os.path.realpath(__file__))
13 |         f = os.path.join(path, c)
14 |         config = configparser.ConfigParser()
15 |         config.read(f)
16 |     except Exception as e:
17 |         print("Error: {}".format(e))
18 |     return config
19 | 
20 | 
21 | file = load_config('config.ini')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements file to install all packages needed for FAME:
2 | # $ python3.7 -m venv fame-env && source fame-env/bin/activate && pip install -r requirements.txt
3 | 
4 | numpy==1.19.0
5 | pandas==0.25.0
6 | requests==2.28.2
7 | scikit-learn==0.21.2
8 | scipy==1.5.1
9 | lief==0.10.1
10 | lightgbm==2.3.1
11 | joblib==1.2.0
12 | chainer==7.8.0
13 | chainerrl==0.8.0
14 | pytest==6.2.5
15 | coverage==6.0
16 | tqdm~=4.62.3
17 | sphinx==4.2.0
18 | 
19 | gym~=0.19.0
20 | setuptools~=57.0.0
21 | 
--------------------------------------------------------------------------------
/data/section_names.txt:
--------------------------------------------------------------------------------
1 | .text
2 | .rsrc
3 | .reloc
4 | .data
5 | .rdata
6 | .idata
7 | .tls
8 | .brdata
9 | .bss
10 | .pdata
11 | .xdata
12 | DATA
13 | CODE
14 | BSS
15 | rdata
16 | .rmnet
17 | .CRT
18 | .edata
19 | .extrel
20 | .sdata
21 | .code
22 | .vmp0
23 | .itext
24 | .data2
25 | .data1
26 | .vmp1
27 | .adata
28 | .gfids
29 | .data3
30 | INIT
31 | .extjmp
32 | .didat
33 | .didata
34 | PAGE
35 | .orpc
36 | vryeypb
37 | camztlf
38 | tkjdelw
39 | dgbwqbp
40 | odyqxub
41 | .tsuarch
42 | .tsustub
43 | .textbss
44 | .sxdata
45 | .zrdata
46 | qxejodg
47 | .data-co
48 | .text-co
49 | gumrkvc
50 | rqvmxkb
51 | kakxcjb
52 | .cdata
53 | ExeS
54 | .rrdata
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
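# For example, from the docs/ directory "make html" renders the HTML docs, and
# "make html O=-W" forwards -W to sphinx-build so that warnings become errors.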
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://www.sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/src/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | from setuptools import setup
3 | 
4 | def read_file(filename):
5 |     with io.open(filename, mode='r', encoding='utf-8') as fd:
6 |         return fd.read()
7 | 
8 | setup(
9 |     name='FAMEwork',
10 |     version='0.1.5',
11 |     use_scm_version=False,
12 |     setup_requires=['setuptools_scm'],
13 |     include_package_data=True,
14 |     packages=['.'],
15 |     install_requires=[
16 |         'numpy==1.19.0',
17 |         'pandas==0.25.0',
18 |         'requests==2.28.2',
19 |         'scikit-learn==0.21.2',
20 |         'scipy==1.5.1',
21 |         'lief==0.10.1',
22 |         'lightgbm==2.3.1',
23 |         'joblib==1.2.0',
24 |         'chainer==7.8.0',
25 |         'chainerrl==0.8.0',
26 |         'pytest==6.2.5',
27 |         'coverage==6.0',
28 |         'tqdm~=4.62.3',
29 |         'sphinx==4.2.0',
30 |         'gym~=0.19.0',
31 |         'setuptools~=57.0.0'],
32 |     url='https://github.com/zRapha/FAME',
33 |     license='MPL-2.0',
34 |     author='Raphael Labaca Castro',
35 |     author_email='mail@rapha.ai',
36 |     description='Framework for Adversarial Malware Evaluation',
37 |     long_description=read_file('PyPI.md'),
38 |     long_description_content_type='text/markdown',
39 |     platforms=['Fedora 30', 'Ubuntu 16'],
40 |     entry_points={'console_scripts': ['fame = main:main']}
41 | )
42 | 
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: FAME
2 | 
3 | on:
4 |   push:
5 |     branches: [ master ]
6 |   pull_request:
7 |     branches: [ master ]
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-22.04
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - name: Set up Python 3.7.15
17 |       uses: actions/setup-python@v2
18 |       with:
19 |         python-version: 3.7.15
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install pip==23.0.1
23 |         pip install flake8 pytest
24 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
25 |     - name: Lint with flake8
26 |       run: |
27 |         # stop the build if there are Python syntax errors or undefined names
28 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
29 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 |         flake8 .
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pytest 34 | - name: Generate coverage report 35 | run: | 36 | coverage run -m pytest 37 | coverage xml 38 | - name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v2 40 | with: 41 | fail_ci_if_error: true 42 | token: 9e4c2ef9-14f0-466b-b8b2-ba3eefddb8e4 43 | -------------------------------------------------------------------------------- /src/config.ini: -------------------------------------------------------------------------------- 1 | [armed] 2 | rounds=100 3 | perturbations=5 4 | advFilesExpected=100 5 | model=EMBER 6 | integrityCheck=False 7 | 8 | [aimed] 9 | rounds=100 10 | perturbations=5 11 | advFilesExpected=1 12 | sizePopulation=4 13 | model=EMBER 14 | integrityCheck=False 15 | searchUAP=False 16 | 17 | [aimedrl] 18 | perturbations=5 19 | model=EMBER 20 | train=True 21 | evaluate=True 22 | 23 | [gameup] 24 | perturbations=10 25 | model=EMBER 26 | integrityCheck=False 27 | 28 | [defense] 29 | perturbations=10 30 | model=EMBER 31 | 32 | [compare] 33 | rounds=100 34 | perturbations=5 35 | advFilesExpected=1 36 | model=EMBER 37 | 38 | [apiKeys] 39 | vt= 40 | ha= 41 | md= 42 | 43 | [paths] 44 | db=db/ 45 | npz=samples/npz/ 46 | mod=samples/mod/ 47 | fail=samples/unsuccessful/ 48 | evasion=samples/successful/ 49 | detected=samples/successful/detected/ 50 | malware_set=samples/malware_set/ 51 | 52 | exploration=samples/uap/greedy/EMBER/exploration_set/ 53 | validation=samples/uap/greedy/EMBER/validation_set/ 54 | 55 | rl=samples/rl/ 56 | report=db/rl/training_reports/last/ 57 | 58 | model_path = data/models/ 59 | vectorized_path = samples/ember/ 60 | 61 | [db] 62 | fields=['Original_File', 63 | 'OF_Detections', 64 | 'Manipulated_File', 65 | 'MF_Detections', 66 | 'Perturbations', 67 | 'Perturbations_Injected', 68 | 'Full_Detections_Report', 69 | 'Full_Analysis_Report', 70 | 'Mod_File_Hash', 71 | 'Original_File_Hash', 72 | 'Date_Reported'] 73 | 74 | [remote] 75 | useVT = False 76 | useHA = False 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'FAME' 21 | copyright = '2022, Raphael Labaca Castro' 22 | author = 'Raphael Labaca Castro' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | ] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 
34 | templates_path = ['_templates']
35 | 
36 | # List of patterns, relative to source directory, that match files and
37 | # directories to ignore when looking for source files.
38 | # This pattern also affects html_static_path and html_extra_path.
39 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
40 | 
41 | 
42 | # -- Options for HTML output -------------------------------------------------
43 | 
44 | # The theme to use for HTML and HTML Help pages. See the documentation for
45 | # a list of builtin themes.
46 | #
47 | html_theme = 'alabaster'
48 | 
49 | # Add any paths that contain custom static files (such as style sheets) here,
50 | # relative to this directory. They are copied after the builtin static files,
51 | # so a file named "default.css" will overwrite the builtin "default.css".
52 | html_static_path = ['_static']
53 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. FAMEwork documentation master file, created by
2 |    sphinx-quickstart on Thu Oct 7 14:19:09 2021.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | ====================================
7 | Welcome to FAME's documentation!
8 | ====================================
9 | 
10 | FAME was designed to understand how byte-level transformations could automatically be injected into Windows Portable Executable (PE) files and compromise ML-based malware classifiers. Moreover, it supports integrity verification to ensure that the new adversarial examples are valid. This work implements the action space proposed in the `OpenAI gym malware <https://github.com/endgameinc/gym-malware>`_ environment. It has been implemented in Fedora 30 and tested on Ubuntu 16 using Python3. Library versions are defined in the requirements.txt file.
11 | 
12 | The framework consists of four modules, namely ARMED, AIMED, AIMED-RL & GAME-UP.
13 | 
14 | GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
15 | 
16 | This work intends to understand how Universal Adversarial Perturbations (UAPs) can be useful to create efficient adversarial examples compared to input-specific attacks. Furthermore, it explores how real malware examples in the problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
17 | 
18 | AIMED-RL: Automatic Intelligent Modifications to Evade Detection (with Reinforcement Learning)
19 | 
20 | This work is focused on understanding how sensitive static malware classifiers are to adversarial examples. It uses different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into Windows portable executable malware without compromising its functionality and, thus, keeping the newly generated adversarial example valid.
21 | 
22 | ..
toctree:: 23 | :maxdepth: 2 24 | :caption: Contents: 25 | 26 | 27 | 28 | Indices and tables 29 | ================== 30 | 31 | * :ref:`genindex` 32 | * :ref:`modindex` 33 | * :ref:`search` 34 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import src.config as cfg 3 | 4 | 5 | class TestMethods(unittest.TestCase): 6 | def test_armed_config(self): 7 | assert cfg.file.getint('armed', 'perturbations') > 0 8 | assert cfg.file.getint('armed', 'advFilesExpected') > 0 9 | assert cfg.file.getint('armed', 'rounds') >= cfg.file.getint('armed', 'advFilesExpected') 10 | assert cfg.file['armed']['model'] == "EMBER" or cfg.file['armed']['model'] == "SOREL" 11 | assert cfg.file.getboolean('armed', 'integrityCheck') is True or \ 12 | cfg.file.getboolean('armed', 'integrityCheck') is False 13 | 14 | def test_aimed_config(self): 15 | assert cfg.file.getint('aimed', 'perturbations') > 0 16 | assert cfg.file.getint('aimed', 'advFilesExpected') > 0 17 | assert cfg.file.getint('aimed', 'sizePopulation') >= 2 18 | assert cfg.file['aimed']['model'] == "EMBER" or cfg.file['aimed']['model'] == "SOREL" 19 | assert cfg.file.getboolean('aimed', 'integrityCheck') is True or \ 20 | cfg.file.getboolean('aimed', 'integrityCheck') is False 21 | 22 | def test_aimedrl_config(self): 23 | assert cfg.file.getint('aimedrl', 'perturbations') > 0 24 | assert cfg.file['aimedrl']['model'] == "EMBER" or cfg.file['aimedrl']['model'] == "SOREL" 25 | assert cfg.file.getboolean('aimedrl', 'train') is True or \ 26 | cfg.file.getboolean('aimedrl', 'train') is False 27 | assert cfg.file.getboolean('aimedrl', 'evaluate') is True or \ 28 | cfg.file.getboolean('aimedrl', 'evaluate') is False 29 | 30 | def test_gameup_config(self): 31 | assert cfg.file.getint('gameup', 'perturbations') > 0 32 | assert cfg.file['gameup']['model'] == "EMBER" or cfg.file['gameup']['model'] == "SOREL" 33 | assert cfg.file.getboolean('gameup', 'integrityCheck') is True or \ 34 | cfg.file.getboolean('gameup', 'integrityCheck') is False 35 | 36 | def test_defense_config(self): 37 | assert cfg.file.getint('defense', 'perturbations') > 0 38 | assert cfg.file['defense']['model'] == "EMBER" or cfg.file['defense']['model'] == "SOREL" 39 | 40 | def test_compare_config(self): 41 | assert cfg.file.getint('compare', 'perturbations') > 0 42 | assert cfg.file.getint('compare', 'advFilesExpected') > 0 43 | assert cfg.file.getint('compare', 'rounds') >= cfg.file.getint('compare', 'advFilesExpected') 44 | assert cfg.file['compare']['model'] == "EMBER" or cfg.file['compare']['model'] == "SOREL" 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /Install.md: -------------------------------------------------------------------------------- 1 | # Installation instructions (dev mode) 2 | 3 | Clone the FAME repository: 4 | ``` 5 | git clone git@github.com:zRapha/FAME.git 6 | ``` 7 | Create a virtual environment & activate it: 8 | ``` 9 | python3.7 -m venv fame-env 10 | source fame-env/bin/activate 11 | ``` 12 | Update pip if needed (pip<=23.0.1): 13 | ``` 14 | python -m pip install pip==23.0.1 15 | ``` 16 | 17 | Install required packages: 18 | ``` 19 | pip install -r requirements.txt 20 | ``` 21 | ## Integrity test verification 22 | Per default the functionality stage is implemented using Cuckoo, an analysis environment that has an 
extensive [documentation](https://cuckoo.readthedocs.io/en/latest/introduction/what/). Cuckoo provides dynamic analysis results, which can be useful to understand the adversarial examples generated. A local beta-test implementation is also provided for further extension.
23 | 
24 | ## Malware classification
25 | Local classification models are implemented to perform detection using pre-trained malware classifiers, namely LightGBM models trained on the EMBER and SOREL datasets. For those interested in more classifiers, we provide the option of using aggregators via REST APIs in order to assess adversarial examples against a wider range of commercial engines.
26 | 
27 | ## Dataset
28 | There are several public repositories containing labeled malicious files to test the environment. Once the data is acquired, it should be placed under the `samples/malware_set/` folder.
29 | 
30 | ## Further environment isolation [optional]
31 | Even though the manipulations do not require running any file, the integrity verification stage does. Hence, it is recommended to use isolated sandboxes and simulated services. One option is to use _inetsim_.
32 | 
33 | Disable the network interface (replace `<interface>` with yours):
34 | ```
35 | sudo ifconfig <interface> down
36 | ```
37 | 
38 | Run inetsim (tested version 1.2.8):
39 | ```
40 | cd /etc/default/inetsim/
41 | sudo ./inetsim
42 | ```
43 | 
44 | Note that automatically retrieving the detection rate for a malware file from an online aggregator will no longer be functional unless adjusted manually.
45 | 
46 | ## How to run FAME
47 | 
48 | ### 1. Activate Cuckoo Python venv:
49 | ```
50 | source ~/cuckoo-env/bin/activate
51 | ```
52 | 
53 | > If integrity verification is implemented proceed with _2_, otherwise jump to _5_.
54 | 
55 | ### 2. Run Mongo DB for webserver:
56 | ```
57 | sudo service mongod start
58 | ```
59 | 
60 | ### 3. Run webserver [optional]:
61 | ```
62 | cd ~/.cuckoo/
63 | cuckoo web
64 | ```
65 | 
66 | ### 4. Run API & Cuckoo sandbox:
67 | ```
68 | cuckoo api
69 | cuckoo
70 | ```
71 | 
72 | ### 5. Adjust configuration and initial parameters:
73 | ```
74 | vi config.ini
75 | ```
76 | 
77 | ### 6. Run FAME:
78 | ```
79 | ./main.py aimed
80 | ```
81 | 
82 | ## Segmentation fault
83 | We have observed that injecting some combinations of perturbations into specific PE files can raise segmentation fault
84 | issues. Due to the nature of memory violations and the occurrence of this issue (in our experiments, less than 0.02% of
85 | the cases), we recommend either adjusting the transformations' sequence to a different combination or trying a new example.
86 | Sometimes not patching the original import table, i.e., setting `builder.patch_imports(False)`, may also help prevent this issue.
87 | A workaround is curating the dataset by identifying the offending PE file and excluding it from the process.
88 | 
89 | 
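For orientation, the LIEF calls behind that suggestion look roughly as follows (a minimal sketch modeled on `data/manipulate.py`; the input and output paths are placeholders, not files shipped with the repo):

```
import lief

binary = lief.PE.parse('samples/malware_set/example.exe')
builder = lief.PE.Builder(binary)
builder.build_imports(True)   # rebuild the import table in a new section
builder.patch_imports(False)  # leave the original import table untouched
builder.build()
builder.write('samples/mod/example_mod.exe')
```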
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Welcome to the Framework for Adversarial Malware Evaluation (FAME)
4 | 
5 | FAME was designed to understand how byte-level transformations could automatically be injected into Windows Portable
6 | Executable (PE) files and compromise ML-based malware classifiers. Moreover, it supports integrity verification to
7 | ensure that the new adversarial examples are valid. This work implements the action space proposed in the OpenAI gym
8 | malware environment. It has been implemented in Fedora 30 and tested on Ubuntu 16 using Python3. Library versions are
9 | defined in the requirements.txt file.
10 | 
11 | The following modules are available: ARMED, AIMED, AIMED-RL & GAME-UP
12 | 
13 | GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
14 | 
15 | This work intends to understand how Universal Adversarial Perturbations (UAPs) can be useful to create efficient
16 | adversarial examples compared to input-specific attacks. Furthermore, it explores how real malware examples in the
17 | problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant
18 | of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
19 | 
20 | AIMED-RL: Automatic Intelligent Modifications to Evade Detection (with Reinforcement Learning)
21 | 
22 | This work is focused on understanding how sensitive static malware classifiers are to adversarial examples. It uses
23 | different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into
24 | Windows portable executable malware without compromising its functionality and, thus, keeping the newly generated
25 | adversarial example valid.
26 | 
27 | """
28 | 
29 | import sys
30 | import time
31 | import src.config as cfg
32 | import src.functions as f
33 | import src.implementation as i
34 | 
35 | 
36 | def main(argv=None):  # avoid evaluating sys.argv at import time
37 |     option = (argv if argv is not None else sys.argv[1]).upper()
38 | 
39 |     # Time algorithm
40 |     start = time.time()
41 | 
42 |     # ARMED: Finding adversarial malware examples stochastically
43 |     if option == 'ARMED':
44 |         i.armed(number_perturbations=cfg.file.getint('armed', 'perturbations'),
45 |                 rounds=cfg.file.getint('armed', 'rounds'), files_expected=cfg.file.getint('armed', 'advFilesExpected'),
46 |                 model=cfg.file['armed']['model'])
47 | 
48 |     # ARMED II: Using Incremental Iterations of perturbations' sequence
49 |     elif option == 'ARMED-II':
50 |         i.armed2(number_perturbations=cfg.file.getint('armed', 'perturbations'),
51 |                  rounds=cfg.file.getint('armed', 'rounds'),
52 |                  files_expected=cfg.file.getint('armed', 'advFilesExpected'),
53 |                  model=cfg.file['armed']['model'])
54 | 
55 |     # AIMED: Finding adversarial examples with genetic programming
56 |     elif option == 'AIMED':
57 |         i.aimed(size_population=cfg.file.getint('aimed', 'sizePopulation'),
58 |                 number_perturbations=cfg.file.getint('aimed', 'perturbations'),
59 |                 model=cfg.file['aimed']['model'])
60 | 
61 |     # AIMED-RL: Finding adversarial examples with reinforcement learning
62 |     elif option == 'AIMED-RL':
63 |         i.aimed_rl(base_path=cfg.file['paths']['rl'],
64 |                    report_path=cfg.file['paths']['report'],
65 |                    train=cfg.file.getboolean('aimedrl', 'train'),
66 |                    evaluate=cfg.file.getboolean('aimedrl', 'evaluate'))
67 | 
68 |     # GAME-UP: Find universal perturbation sequences to generate adversarial examples
69 |     elif option == 'GAMEUP':
70 |         i.gameup(number_perturbations=cfg.file.getint('gameup', 'perturbations'), model=cfg.file['gameup']['model'],
71 |                  exploration_set=cfg.file['paths']['exploration'])
72 | 
73 |     # UAP-DEF: Use UAPs to increase resilience of models against universal attacks
74 |     elif option == 'DEFENSE':
75 |         i.defense(number_perturbations=cfg.file.getint('defense', 'perturbations'),
76 |                   model=cfg.file['defense']['model'])
77 | 
78 |     # COMPARE: Evaluate different algorithms (Example imp.: AIMED vs ARMED)
79 |     elif option == 'COMPARE':
80 |         i.comparing(number_perturbations=cfg.file.getint('compare', 'perturbations'),
81 |                     rounds=cfg.file.getint('compare', 'rounds'),
82 |                     files_expected=cfg.file.getint('compare', 'advFilesExpected'),
83 |                     model=cfg.file['compare']['model'])
84 | 
85 |     else:
86 |         exit('Option not found!')
87 | 
88 |     f.time_me(start)
89 | 
90 | 
91 | if __name__ == '__main__':
92 |     main()
93 | 
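# Example invocations (the module argument is upper-cased, so case does not matter):
#   ./main.py armed      -> stochastic search
#   ./main.py armed-ii   -> incremental perturbation sequences
#   ./main.py aimed      -> genetic programming
#   ./main.py aimed-rl   -> reinforcement learning
#   ./main.py gameup     -> universal perturbation search
#   ./main.py defense    -> UAP-based adversarial training
#   ./main.py compare    -> AIMED vs. ARMED comparison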
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FAME
2 | 
3 | ![Workflow](https://github.com/zrapha/fame/actions/workflows/main.yml/badge.svg)
4 | [![codecov](https://codecov.io/gh/zRapha/famework/branch/master/graph/badge.svg?token=oMFazw4iLl)](https://codecov.io/gh/zRapha/famework)
5 | [![License: MPL v2](https://img.shields.io/badge/license-MPL--2.0-blue.svg)](https://www.mozilla.org/en-US/MPL/2.0/)
6 | 
7 | 
8 | 
11 | 
12 | ## Welcome to the Framework for Adversarial Malware Evaluation
13 | 
14 | FAME has been designed to evaluate ML-based malware classifiers against adversarial examples. It aims to provide an understanding of how byte-level transformations can be injected into Windows Portable Executable (PE) files and compromise models. Moreover, it supports integrity verification to ensure that the adversarial examples remain valid after manipulation. This work implements the action space proposed in the [OpenAI gym malware](https://github.com/endgameinc/gym-malware) environment. It has been implemented and tested using Fedora 30 and Ubuntu 16 with Python3. Library versions are defined in the `requirements.txt` file.
15 | 
16 | The framework consists of the following modules: ARMED, AIMED / AIMED-RL & GAME-UP.
17 | 
18 | ### GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
19 | 
20 | This module intends to analyze how Universal Adversarial Perturbations (UAPs) can be useful to create efficient adversarial examples compared to input-specific attacks. It explores how real malware examples in the problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
21 | 
22 | ### AIMED: Automatic Intelligent Modifications to Evade Detection
23 | 
24 | This approach focuses on understanding how sensitive static malware classifiers are to adversarial examples. It uses different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into Windows PE malware without compromising its functionality, keeping the freshly generated adversarial example valid.
25 | 
26 | ### ARMED: Automatic Random Modifications to Evade Detection
27 | 
28 | With this option, sequences of transformations are chosen randomly to identify weak spots in the classifier. This module implements a pipeline that is able to automatically generate realizable adversarial examples in the malware context.
29 | 
30 | ## How to run FAME
31 | 
32 | Here we describe how to run `FAME` by installing the package directly from `pip`. For more detail about running from source and manual configuration of parameters, refer to the [install](https://github.com/zRapha/FAME/blob/master/Install.md) instructions.
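For reference, a from-source run (covered in the install instructions) boils down to cloning the repository, installing the pinned requirements, and calling the entry script with a module name:

```
git clone git@github.com:zRapha/FAME.git && cd FAME
pip install -r requirements.txt
./main.py aimed
```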
33 | 34 | Install `FAME`: 35 | ``` 36 | pip install famework 37 | ``` 38 | Run `FAME` with any module (e.g., AIMED): 39 | ``` 40 | fame aimed 41 | ``` 42 | 43 | ## Contributors 44 | 45 | We appreciate the contributions that helped to improve this work: 46 | 47 | | Contributor | University | Module | 48 | |-----------------|--------------------------------|------------------------| 49 | | Sebastian Franz | Technische Universität München | Reinforcement Learning | 50 | 51 | ## Citation 52 | 53 | If you find this work useful you are highly encouraged to cite the following articles. For the framework, you can refer to my dissertation: 54 | 55 | `FAME` 56 | ``` 57 | @book{labaca-castro2023fame, 58 | title={Machine Learning under Malware Attack}, 59 | author={Labaca-Castro, Raphael}, 60 | year={2023}, 61 | publisher={Springer Nature} 62 | } 63 | ``` 64 | --- 65 | If you worked with more specific modules feel free to reference them separately: 66 | 67 | `GAME-UP` 68 | ``` 69 | @article{labaca-castro2022universal, 70 | title={Realizable Universal Adversarial Perturbations for Malware}, 71 | author={Labaca-Castro, Raphael and Mu{\~n}oz-Gonz{\'a}lez, Luis and Pendlebury, Feargus and Rodosek, Gabi Dreo and Pierazzi, Fabio and Cavallaro, Lorenzo}, 72 | journal={arXiv preprint arXiv:2102.06747}, 73 | year={2022} 74 | } 75 | ``` 76 | 77 | `AIMED-RL` 78 | ``` 79 | @inproceedings{labaca-castro2021aimed-rl, 80 | title={AIMED-RL: Exploring Adversarial Malware Examples with Reinforcement Learning }, 81 | author={Labaca-Castro, Raphael and Franz, Sebastian and Rodosek, Gabi Dreo}, 82 | booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases (ECML PKDD)}, 83 | pages={37--52}, 84 | year={2021}, 85 | organization={Springer} 86 | } 87 | ``` 88 | 89 | `AIMED` 90 | ``` 91 | @inproceedings{labaca-castro2019aimed, 92 | title={AIMED: Evolving Malware with Genetic Programming to Evade Detection}, 93 | author={Labaca-Castro, Raphael and Schmitt, Corinna and Rodosek, Gabi Dreo}, 94 | booktitle={2019 18th IEEE International Conference On Trust, Security And Privacy In Computing And Communications/13th IEEE International Conference On Big Data Science And Engineering (TrustCom/BigDataSE)}, 95 | pages={240--247}, 96 | year={2019}, 97 | organization={IEEE} 98 | } 99 | ``` 100 | 101 | `ARMED` 102 | ``` 103 | @inproceedings{labaca-castro2019armed, 104 | title={ARMED: How Automatic Malware Modifications Can Evade Static Detection?}, 105 | author={Labaca-Castro, Raphael and Schmitt, Corinna and Rodosek, Gabi Dreo}, 106 | booktitle={2019 5th International Conference on Information Management (ICIM)}, 107 | pages={20--27}, 108 | year={2019}, 109 | organization={IEEE} 110 | } 111 | ``` 112 | -------------------------------------------------------------------------------- /src/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | 8 | matplotlib.use('Agg') 9 | 10 | 11 | def csv_into_list(CSV, sample): 12 | # Setting fields for CSV 13 | fields = ['Original_File', 'OF_Detections', 'Manipulated_File', 'MF_Detections', 'Perturbations', 14 | 'Perturbations_Injected', 15 | 'Full_Detections_Report', 'Full_Analysis_Report', 'Mod_File_Hash', 'Original_File_Hash', 'Date_Reported'] 16 | 17 | # Retrieve database 18 | df = pd.read_csv(CSV, names=fields, header=None) 19 | 20 | # Use only rows about %sample% 21 | df = 
df.loc[df['Original_File'] == 'samples/' + sample]
22 |     if df.empty:
23 |         print('No samples found with that name in the database.')
24 |         quit()
25 | 
26 |     # Identifying x, y & cleaning out detections values. Only for successful
27 |     # samples, otherwise just get perturbations as there are no detections
28 |     if not df['MF_Detections'].isnull().any():
29 |         # print(df['MF_Detections'])
30 |         df['MF_Detections'] = df['MF_Detections'].map(lambda x: x[:2])
31 |         detections = df['MF_Detections'].values.tolist()
32 | 
33 |         for i in range(len(detections)):
34 |             if '/' in detections[i]:
35 |                 detections[i] = detections[i][:1]
36 | 
37 |         # Merging both structures into one list skipping headers
38 |         perts_and_detections = list(map(list, zip(df['Perturbations'][0:], detections[0:])))
39 | 
40 |     else:
41 |         perts_and_detections = list(df['Perturbations'][1:])
42 | 
43 |     # Retrieving detections ratio for original file
44 |     benchmark = df['OF_Detections'].values[0]
45 | 
46 |     return perts_and_detections, benchmark[:2]
47 | 
48 | 
49 | def accumulative_counter(perts_and_detections):
50 |     keys = [str(k) for k in range(1, 26)]  # map objects are not subscriptable in Python 3
51 |     accumulative_dict = {key: 0 for key in keys}
52 |     counter_dict = {key: 0 for key in keys}
53 | 
54 |     val1, val2 = perts_and_detections[:2]
55 |     if val1 != val2:  # Check whether database.csv (successes) or fail_database.csv (fails) is handed
56 |         for i in range(len(perts_and_detections)):
57 |             accumulative_dict[perts_and_detections[i][0]] = accumulative_dict[perts_and_detections[i][0]] + \
58 |                 int(perts_and_detections[i][1])
59 |             counter_dict[perts_and_detections[i][0]] = counter_dict[perts_and_detections[i][0]] + 1
60 |     else:
61 |         c = Counter(perts_and_detections)
62 |         c = {int(k): int(v) for k, v in c.items()}
63 |         counter_dict = dict(sorted(c.items()))
64 | 
65 |     # Removing keys with zero values to avoid ZeroDivisionError
66 |     accumulative_dict = {k: v for k, v in accumulative_dict.items() if v != 0}
67 |     counter_dict = {k: v for k, v in counter_dict.items() if v != 0}
68 | 
69 |     return accumulative_dict, counter_dict, len(keys)
70 | 
71 | 
72 | def string_to_int_list(perts_and_detections):
73 |     # Converting str list into int list
74 |     list_perts = [int(a) for a, b in perts_and_detections]
75 |     list_detections = [int(b) for a, b in perts_and_detections]
76 |     new_list = sorted(list(map(list, zip(list_perts, list_detections))))
77 | 
78 |     return new_list
79 | 
80 | 
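# For instance, string_to_int_list([['5', '12'], ['3', '7']]) returns
# [[3, 7], [5, 12]]: pairs are cast to int and sorted by perturbation count.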
Mutations [n={}]".format(len(perts_and_detections))) 107 | ax.set_xlabel('Perturbations') 108 | ax.set_ylabel('Average of VirusTotal Detections') 109 | plt.hlines(y=int(benchmark), colors='r', xmin=0, xmax=len_keys, linestyles='dashed') # , label='Original file') 110 | plt.legend(loc=1) 111 | plt.savefig('graphics/' + sample + '/VTEvsPI.png') 112 | 113 | 114 | def scatter_plot(CSV, sample): 115 | # Converting CSV into list of list 116 | perts_and_detections, benchmark = csv_into_list(CSV, sample) 117 | 118 | # Converting str list into int list 119 | new_list = string_to_int_list(perts_and_detections) 120 | 121 | # Defining x, y and N 122 | x = [int(a) for a, b in new_list] 123 | y = [int(b) for a, b in new_list] 124 | area = 10 125 | 126 | # Plot each mutated sample in ARMED database 127 | ax = plt.gca() 128 | plt.scatter(x[:300], y[:300], s=area, c='black', alpha=0.5, label="Mutations (S')") 129 | plt.hlines(y=int(benchmark), colors='r', xmin=0, xmax=22, linestyles='dashed', label="Original (S)") 130 | 131 | # General formatting 132 | ax.set_xlim(1.9, 25.1) 133 | # ax.xaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}')) 134 | ax.set_ylim(0, 57) 135 | # ax.set_title("ARMED: Distribution of Mutations [n={}]".format(N)) 136 | ax.set_xlabel('Number of perturbations injected') 137 | ax.set_ylabel('Number of detection engines') # (max = 68)') 138 | ax.legend(loc='upper center', bbox_to_anchor=(0.576, 1.01), fancybox=False, shadow=False, ncol=5) 139 | plt.savefig('graphics/' + sample + '/scatter_plot.png') 140 | 141 | 142 | def ratio_functional(CSV, CSV_fail, sample): 143 | # Converting CSV into list of list 144 | perts_and_detections, benchmark = csv_into_list(CSV, sample) 145 | perts_fail, benchmark_fail = csv_into_list(CSV_fail, sample) 146 | 147 | # Counting detections of manipulated sample based on perturbations injected 148 | _, counter_dict, len_keys = accumulative_counter(perts_and_detections) 149 | _, counter_dict_fail, _ = accumulative_counter(perts_fail) 150 | # print('Number of samples per injection: (fail database)\n{}'.format(counter_dict_fail)) 151 | 152 | # Defining y vars 153 | y = list(counter_dict.values()) 154 | y_fail = list(counter_dict_fail.values()) 155 | 156 | # Plot ration of functional vs. non-functional 157 | plt.figure() 158 | 159 | # Values of each bar 160 | bars_s = y[:len_keys] 161 | bars_f = y_fail[:len_keys] 162 | sum_y = sum(y_fail[:len_keys]) + sum(y[:len_keys]) 163 | 164 | # Check 10 perts were injected and all p > 10 165 | if len(bars_s) < 10 or y[0] < 10: 166 | print('Not enough data for bar plot yet') 167 | quit() 168 | 169 | # Position of bars on x-axis 170 | r = list(range(24)) 171 | 172 | # Names of group and bar width 173 | names = map(str, list(range(26)))[2:] 174 | barWidth = 0.85 175 | 176 | # Create successful & failed mutations bars 177 | plt.bar(r, bars_s, color='darkgray', edgecolor='white', width=barWidth, label='Functional') 178 | plt.bar(r, bars_f, bottom=bars_s, color='gray', edgecolor='white', width=barWidth, label='Non-functional') 179 | 180 | # Formatting 181 | # plt.title("ARMED: Functional vs. 
Non-functional Mutations [n={}]".format(sum_y)) 182 | plt.hlines(y=sum_y / len(y[:len_keys]), colors='r', xmin=0, xmax=len_keys - 2, linestyles='dashed', label='Average') 183 | plt.ylim(0, max(y_fail) + y[0] + 5) 184 | plt.xticks(r, names) 185 | plt.xlabel("Number of perturbations injected") 186 | plt.ylabel("Number of mutations generated") 187 | plt.legend(loc=2) 188 | plt.savefig('graphics/' + sample + '/ratio_functional.png') 189 | 190 | 191 | if __name__ == '__main__': 192 | det_vs_pert('db/database.csv', 'original/keylogger') 193 | scatter_plot('db/database.csv', 'original/keylogger') 194 | ratio_functional('db/database.csv', 'db/fail_database.csv', 'original/keylogger') 195 | -------------------------------------------------------------------------------- /data/manipulate.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/endgameinc/gym-malware 2 | 3 | import lief 4 | import json 5 | import os 6 | import sys 7 | import array 8 | import struct # byte manipulations 9 | import random 10 | import tempfile 11 | import subprocess 12 | import functools 13 | import signal 14 | import multiprocessing 15 | 16 | MODULE_PATH = os.path.split(os.path.abspath(sys.modules[__name__].__file__))[0] 17 | 18 | COMMON_SECTION_NAMES = open(os.path.join( 19 | MODULE_PATH, 'section_names.txt'), 'r').read().rstrip().split('\n') 20 | 21 | COMMON_IMPORTS = json.load( 22 | open(os.path.join(MODULE_PATH, 'small_dll_imports.json'), 'r')) 23 | 24 | 25 | class MalwareManipulator(object): 26 | def __init__(self, bytez): 27 | self.bytez = bytez 28 | self.min_append_log2 = 5 29 | self.max_append_log2 = 8 30 | 31 | def __random_length(self): 32 | return 2**random.randint(self.min_append_log2, self.max_append_log2) 33 | 34 | def __binary_to_bytez(self, binary, dos_stub=False, imports=False, overlay=False, relocations=False, resources=False, tls=False): 35 | builder = lief.PE.Builder(binary) # write the file back as bytez 36 | if(dos_stub): 37 | builder.build_dos_stub(dos_stub) # rebuild DOS stub 38 | if(imports): 39 | builder.build_imports(imports) # rebuild IAT in another section 40 | builder.patch_imports(imports) # patch orig. 
import table with trampolines to new import table 41 | if(overlay): 42 | builder.build_overlay(overlay) # rebuild overlay 43 | if(relocations): 44 | builder.build_relocations(relocations) # rebuild relocation table in another section 45 | if(resources): 46 | builder.build_resources(resources) # rebuild resources in another section 47 | if(tls): 48 | builder.build_tls(tls) # rebuilt TLS object in another section 49 | builder.build() # perform the build process 50 | return array.array('B', builder.get_build()).tobytes() 51 | 52 | def overlay_append(self, seed=None): 53 | random.seed(seed) 54 | L = self.__random_length() 55 | # choose the upper bound for a uniform distribution in [0,upper] 56 | upper = random.randrange(256) 57 | # upper chooses the upper bound on uniform distribution: 58 | # upper=0 would append with all 0s 59 | # upper=126 would append with "printable ascii" 60 | # upper=255 would append with any character 61 | return self.bytez + bytes([random.randint(0, upper) for _ in range(L)]) 62 | 63 | def imports_append(self, seed=None): 64 | # add (unused) imports 65 | random.seed(seed) 66 | binary = lief.PE.parse(list(self.bytez)) 67 | # draw a library at random 68 | libname = random.choice(list(COMMON_IMPORTS.keys())) 69 | funcname = random.choice(list(COMMON_IMPORTS[libname])) 70 | lowerlibname = libname.lower() 71 | # find this lib in the imports, if it exists 72 | lib = None 73 | for im in binary.imports: 74 | if im.name.lower() == lowerlibname: 75 | lib = im 76 | break 77 | if lib is None: 78 | # add a new library 79 | lib = binary.add_library(libname) 80 | # get current names 81 | names = set([e.name for e in lib.entries]) 82 | if not funcname in names: 83 | lib.add_entry(funcname) 84 | 85 | self.bytez = self.__binary_to_bytez(binary,imports=True) 86 | 87 | return self.bytez 88 | 89 | def section_rename(self, seed=None): 90 | # rename a random section 91 | random.seed(seed) 92 | binary = lief.PE.parse(list(self.bytez)) 93 | targeted_section = random.choice(binary.sections) 94 | targeted_section.name = random.choice(COMMON_SECTION_NAMES)[:7] #actual version of lief not allowing 8 chars? 
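# (PE/COFF section names are limited to 8 bytes; the [:7] slice keeps one byte
#  spare to work around the LIEF limitation flagged in the comment above.)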
95 | 96 | self.bytez = self.__binary_to_bytez(binary) 97 | 98 | return self.bytez 99 | 100 | def section_add(self, seed=None): 101 | random.seed(seed) 102 | binary = lief.PE.parse(list(self.bytez)) 103 | new_section = lief.PE.Section( 104 | "".join(chr(random.randrange(ord('.'), ord('z'))) for _ in range(6))) 105 | 106 | # fill with random content 107 | upper = random.randrange(256) 108 | L = self.__random_length() 109 | new_section.content = [random.randint(0, upper) for _ in range(L)] 110 | 111 | new_section.virtual_address = max( 112 | [s.virtual_address + s.size for s in binary.sections]) 113 | # add a new empty section 114 | 115 | binary.add_section(new_section, 116 | random.choice([ 117 | lief.PE.SECTION_TYPES.BSS, 118 | lief.PE.SECTION_TYPES.DATA, 119 | lief.PE.SECTION_TYPES.EXPORT, 120 | lief.PE.SECTION_TYPES.IDATA, 121 | lief.PE.SECTION_TYPES.RELOCATION, 122 | lief.PE.SECTION_TYPES.RESOURCE, 123 | lief.PE.SECTION_TYPES.TEXT, 124 | lief.PE.SECTION_TYPES.TLS_, 125 | lief.PE.SECTION_TYPES.UNKNOWN, 126 | ])) 127 | 128 | self.bytez = self.__binary_to_bytez(binary) 129 | return self.bytez 130 | 131 | def section_append(self, seed=None): 132 | # append to a section (changes size and entropy) 133 | random.seed(seed) 134 | binary = lief.PE.parse(list(self.bytez)) 135 | targeted_section = random.choice(binary.sections) 136 | L = self.__random_length() 137 | available_size = targeted_section.size - len(targeted_section.content) 138 | if L > available_size: 139 | L = available_size 140 | 141 | upper = random.randrange(256) 142 | targeted_section.content = targeted_section.content + \ 143 | [random.randint(0, upper) for _ in range(L)] 144 | 145 | self.bytez = self.__binary_to_bytez(binary) 146 | return self.bytez 147 | 148 | # def section_reorder(self,param,seed=None): 149 | # # reorder directory of sections 150 | # pass 151 | 152 | def create_new_entry(self, seed=None): 153 | # create a new section with jump to old entry point, and change entry point 154 | # DRAFT: this may have a few technical issues with it (not accounting for relocations), but is a proof of concept for functionality 155 | random.seed(seed) 156 | 157 | binary = lief.PE.parse(list(self.bytez)) 158 | 159 | # get entry point 160 | entry_point = binary.optional_header.addressof_entrypoint 161 | 162 | # get name of section 163 | entryname = binary.section_from_rva(entry_point).name 164 | 165 | # create a new section 166 | new_section = lief.PE.Section(entryname + "".join(chr(random.randrange( 167 | ord('.'), ord('z'))) for _ in range(3))) # e.g., ".text" + 3 random characters 168 | # push [old_entry_point]; ret 169 | new_section.content = [ 170 | 0x68] + list(struct.pack("= 100: 144 | for z in range(2, self.size_population - 1): 145 | if self.members[z].code != self.members[elem].code and self.members[z].code != self.members[elem+1].code: 146 | self.members[elem] = self.members[z] 147 | break 148 | 149 | # Show updated population 150 | print('\n# Population: ', end='') 151 | [print(self.members[s].code, round(self.members[s].cost, 4), end=' # ') for s in range(len(self.members))] 152 | print('\n') 153 | 154 | def listEvasions(self): 155 | 156 | """ Show evasive members """ 157 | 158 | sequence_list = [] 159 | [sequence_list.append(sequence) for sequence in self.mutations_processed if 160 | sequence[2] > 0 and sequence[0] not in sequence_list] 161 | return sequence_list 162 | 163 | def allEvasion(self): 164 | 165 | """ Check whether all members are evasive """ 166 | 167 | duplicates = [] 168 | if self.members[0].cost < 100: 169 | 
return False 170 | for z in range(len(self.members) - 1): 171 | if self.members[z].cost == self.members[z + 1].cost: 172 | pass 173 | else: 174 | return False 175 | 176 | # Create a list with only member.code to make it hashable 177 | for k in self.members: 178 | duplicates.append(k.code) 179 | 180 | # Make sure there are no duplicated genes in the population 181 | if len(set(map(tuple, duplicates))) == len(self.members): 182 | print('\nAll sequences in the population lead to evasive mutations!') 183 | print('\nPopulation: ', end='') 184 | [print(self.members[z].code, self.members[z].cost, end=' # ') for z in range(len(self.members))] 185 | return True 186 | 187 | def generation(self, file, actions, search_uap=False): 188 | 189 | # Run until termination criteria are met 190 | if search_uap: 191 | while not self._generation_uap(actions): 192 | pass 193 | else: 194 | while not self._generation(file, actions): 195 | pass 196 | 197 | # Once finished, show evasive sequences if any sorted by most evasive 198 | if self.new_evasions: 199 | list_evasions = sorted(self.listEvasions(), key=operator.itemgetter(2), reverse=True) 200 | number_fittest_evasions = math.floor(len(list_evasions) / 10) 201 | print('\nAll evasive sequences found: {}\n'.format(len(list_evasions))) 202 | print('Displaying only 10% of fittest evasions:') 203 | for seq in range(number_fittest_evasions): 204 | print('Sequence: {} -- Fitness: {} -- Evasions: {}'.format(list_evasions[seq][0], 205 | round(list_evasions[seq][1], 2), list_evasions[seq][2])) 206 | return list_evasions 207 | else: 208 | print('No evasive sequences found.') 209 | 210 | return 0 211 | 212 | def _generation(self, sample, actions): 213 | 214 | # Set UseVT to VirusTotal report 215 | useVT = cfg.file.getboolean('remote', 'useVT') 216 | 217 | # Call selection before breeding 218 | self.selection() 219 | 220 | # Breeding & mutating and adding children to the members list for Selection afterwards 221 | self.breed() 222 | 223 | gene_num = 0 224 | scanner = cfg.file['aimed']['model'] 225 | for member in self.members: 226 | existing_member = False 227 | 228 | # If mutation was processed retrieve fitness value & avoid processing again 229 | for x in range(len(self.mutations_processed)): 230 | if self.mutations_processed[x][0] == member.code: 231 | member.cost = self.mutations_processed[x][1] 232 | # print('\nFitness: {}'.format(member.cost)) 233 | existing_member = True 234 | break 235 | 236 | evasion = 0 237 | if not existing_member: 238 | 239 | # First generation calculates all genes, then breeds+mutates 2 members per generation 240 | gene_num += 1 241 | if self.generationNumber == 1: 242 | print('# Calculating fitness for gene {} of {}: {} #'.format(gene_num, len(self.members), 243 | member.code)) 244 | else: 245 | print('# Calculating fitness for child {}: {} #'.format(gene_num, member.code)) 246 | 247 | # Inject children sequences to input object to create four adversarial examples 248 | bin_bytes = f.readfile(sample) 249 | mod_sample = f.rec_mod_files(bin_bytes, actions, member.code, len(member.code) - 1) 250 | 251 | # If adversarial file returns errors, terminate in current generation 252 | if not mod_sample: 253 | return True 254 | 255 | # Collect info to writeCSV function 256 | mod_sample_hash = f.hash_files(mod_sample) 257 | sample_report = {'positives': 1, 'total': 1} 258 | CSV = f.collect_info_CSV(sample, sample_report, len(member.code), member.code, 259 | mod_sample_hash, f.hash_files(sample)) 260 | 261 | # Analyze functionality results 262 | if 
cfg.file.getboolean('aimed', 'integrityCheck'): 263 | funcional, url_sandbox = i.malware_analysis(mod_sample, useVT, CSV) 264 | else: 265 | # When f.batch_functionality_test() is used instead of online verification 266 | funcional, url_sandbox = True, "www.no_integrity_test.com" 267 | 268 | # Analyze detection results 269 | if funcional: 270 | # print('Running detection for gene:', member.code) 271 | detected, _ = i.malware_detection(mod_sample, scanner) 272 | mutation_name = str(len(member.code)) + '_m.exe' 273 | evasion = i.save_file_database(detected, mutation_name, url_sandbox, CSV, scanner) 274 | self.new_evasions += evasion 275 | 276 | # Calculate difference between original sample and mutation 277 | self.diff_samples = f.get_difference(sample, mod_sample) 278 | diff_adjusted = round(self.diff_samples / 100000, 3) # Constant empirically defined 279 | 280 | # Set cost to adversarial instances 281 | member.calcCost(detected, self.generationNumber, diff_adjusted) 282 | else: 283 | # Send empty when corrupt 284 | member.calcCost('', self.generationNumber, 0) 285 | self.corrupt_mutations += 1 286 | 287 | self.mutations_processed.append([member.code, member.cost, evasion]) 288 | 289 | print('Sequence: {} – Fitness: {}\n'.format(member.code, member.cost)) 290 | 291 | if self.new_evasions: 292 | print('# Evasive mutations found: {} #'.format(self.new_evasions)) 293 | print('# Corrupt mutations found: {} #\n'.format(self.corrupt_mutations)) 294 | 295 | # Termination: number of evasions achieved or number of generations reach termination defined 296 | files_expected = cfg.file.getint('aimed', 'advFilesExpected') 297 | termination_per_generation = files_expected ** 2 if files_expected >= 10 else self.rounds 298 | if self.generationNumber == termination_per_generation: # self.new_evasions >= files_expected or 299 | return True 300 | 301 | self.generationNumber += 1 302 | return False 303 | 304 | def _generation_uap(self, actions): 305 | 306 | # Set UseVT to VirusTotal report 307 | useVT = cfg.file.getboolean('remote', 'useVT') 308 | 309 | # Call selection before breeding 310 | self.selection() 311 | 312 | # Breeding & mutating and adding children to the members list for Selection afterwards 313 | self.breed() 314 | 315 | # Calculate size of directory 316 | files_exp_set = os.listdir(EXPLORATION_SET) 317 | size_exp_set = len(files_exp_set) 318 | 319 | gene_num = 0 320 | scanner = cfg.file['aimed']['model'] 321 | for member in self.members: 322 | existing_member = False 323 | 324 | # If mutation was processed retrieve fitness value & avoid processing again 325 | for x in range(len(self.mutations_processed)): 326 | if self.mutations_processed[x][0] == member.code: 327 | member.cost = self.mutations_processed[x][1] 328 | # print('\nFitness: {}'.format(member.cost)) 329 | existing_member = True 330 | break 331 | 332 | if not existing_member: 333 | 334 | # First generation calculates all genes, then breeds+mutates 2 members per generation 335 | gene_num += 1 336 | if self.generationNumber == 1: 337 | print('# Calculating fitness for gene {} of {}: {} #'.format(gene_num, len(self.members), 338 | member.code)) 339 | else: 340 | print('# Calculating fitness for child {}: {} #'.format(gene_num, member.code)) 341 | 342 | # Picking sequentially each file from source folder 343 | current_file = 1 344 | evasions_in_generation = 0 345 | for each_sample in tqdm(sorted(os.listdir(EXPLORATION_SET))): 346 | 347 | # Convert selected sample into binaries 348 | sample = os.path.join(EXPLORATION_SET, each_sample) 349 
| bin_bytes = f.readfile(sample) 350 | 351 | # Inject children sequences to input file to create four adversarial examples 352 | mod_sample = f.rec_mod_files(bin_bytes, actions, member.code, len(member.code) - 1) 353 | 354 | # If adversarial example returns errors, terminate in current generation 355 | if not mod_sample: 356 | os.rename(os.path.join(EXPLORATION_SET, each_sample), EXPLORATION_SET + 'LIEF_Error_' + each_sample) 357 | return True 358 | 359 | # Collect info to writeCSV function 360 | mod_sample_hash = f.hash_files(mod_sample) 361 | sample_report = {'positives': 1, 'total': 1} 362 | CSV = f.collect_info_CSV(sample, sample_report, len(member.code), member.code, 363 | mod_sample_hash, f.hash_files(sample)) 364 | 365 | # Analyze functionality results 366 | if cfg.file.getboolean('aimed', 'integrityCheck'): 367 | funcional, url_sandbox = i.malware_analysis(mod_sample, useVT, CSV) 368 | else: 369 | # When f.batch_functionality_test() is used instead of online verification 370 | funcional, url_sandbox = True, "www.no_integrity_test.com" 371 | 372 | # Analyze detection results 373 | if funcional: 374 | # print('Running detection for gene:', member.code) 375 | detected, score = i.malware_detection(mod_sample, scanner, verbose=False) 376 | mutation_name = str(len(member.code)) + '_m.exe' 377 | self.new_evasions += i.save_file_database(detected, mutation_name, url_sandbox, CSV, scanner, 378 | verbose=False) 379 | 380 | # Calculate difference between original sample and mutation 381 | self.diff_samples = f.get_difference(sample, mod_sample) 382 | diff_adjusted = round(self.diff_samples / 100000, 3) # Constant empirically defined 383 | 384 | # Set cost to adversarial instances 385 | member.calcCost(detected, self.generationNumber, diff_adjusted, size_dir=size_exp_set, 386 | conf_rate=score, search_UAP=True) 387 | 388 | if not detected: 389 | evasions_in_generation += 1 390 | else: 391 | # Send empty when corrupt 392 | member.calcCost('', self.generationNumber, 0, size_dir=size_exp_set, search_UAP=True) 393 | self.corrupt_mutations += 1 394 | 395 | current_file += 1 396 | 397 | # Check if member has potential to be UAP 398 | if evasions_in_generation >= 20: 399 | self.potential_uap.append([member.code, member.cost, evasions_in_generation]) 400 | 401 | self.mutations_processed.append([member.code, member.cost, evasions_in_generation]) 402 | 403 | print('\nSequence: {} – Fitness: {} - Evasions: {}\n'.format(member.code, round(member.cost, 4), 404 | evasions_in_generation)) 405 | 406 | if self.potential_uap: 407 | print('# Potential UAP candidates found: {} #'.format(len(self.potential_uap))) 408 | 409 | # Termination: number of evasions achieved or number of generations reach termination defined 410 | files_expected = cfg.file.getint('aimed', 'advFilesExpected') 411 | termination_per_generation = files_expected ** 2 if files_expected >= 10 else self.rounds 412 | if self.generationNumber == termination_per_generation: 413 | if self.potential_uap: 414 | print("\nUAP candidates:") 415 | for candidate in range(len(self.potential_uap)): 416 | print('Sequence: {} -- Fitness: {} -- Evasions: {}'.format(self.potential_uap[candidate][0], 417 | round(self.potential_uap[candidate][1], 2), 418 | self.potential_uap[candidate][2])) 419 | return True 420 | 421 | self.generationNumber += 1 422 | return False 423 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | Copyright (c) Raphael 
Labaca Castro 2 | 3 | Mozilla Public License Version 2.0 4 | ================================== 5 | 6 | 1. Definitions 7 | -------------- 8 | 9 | 1.1. "Contributor" 10 | means each individual or legal entity that creates, contributes to 11 | the creation of, or owns Covered Software. 12 | 13 | 1.2. "Contributor Version" 14 | means the combination of the Contributions of others (if any) used 15 | by a Contributor and that particular Contributor's Contribution. 16 | 17 | 1.3. "Contribution" 18 | means Covered Software of a particular Contributor. 19 | 20 | 1.4. "Covered Software" 21 | means Source Code Form to which the initial Contributor has attached 22 | the notice in Exhibit A, the Executable Form of such Source Code 23 | Form, and Modifications of such Source Code Form, in each case 24 | including portions thereof. 25 | 26 | 1.5. "Incompatible With Secondary Licenses" 27 | means 28 | 29 | (a) that the initial Contributor has attached the notice described 30 | in Exhibit B to the Covered Software; or 31 | 32 | (b) that the Covered Software was made available under the terms of 33 | version 1.1 or earlier of the License, but not also under the 34 | terms of a Secondary License. 35 | 36 | 1.6. "Executable Form" 37 | means any form of the work other than Source Code Form. 38 | 39 | 1.7. "Larger Work" 40 | means a work that combines Covered Software with other material, in 41 | a separate file or files, that is not Covered Software. 42 | 43 | 1.8. "License" 44 | means this document. 45 | 46 | 1.9. "Licensable" 47 | means having the right to grant, to the maximum extent possible, 48 | whether at the time of the initial grant or subsequently, any and 49 | all of the rights conveyed by this License. 50 | 51 | 1.10. "Modifications" 52 | means any of the following: 53 | 54 | (a) any file in Source Code Form that results from an addition to, 55 | deletion from, or modification of the contents of Covered 56 | Software; or 57 | 58 | (b) any new file in Source Code Form that contains any Covered 59 | Software. 60 | 61 | 1.11. "Patent Claims" of a Contributor 62 | means any patent claim(s), including without limitation, method, 63 | process, and apparatus claims, in any patent Licensable by such 64 | Contributor that would be infringed, but for the grant of the 65 | License, by the making, using, selling, offering for sale, having 66 | made, import, or transfer of either its Contributions or its 67 | Contributor Version. 68 | 69 | 1.12. "Secondary License" 70 | means either the GNU General Public License, Version 2.0, the GNU 71 | Lesser General Public License, Version 2.1, the GNU Affero General 72 | Public License, Version 3.0, or any later versions of those 73 | licenses. 74 | 75 | 1.13. "Source Code Form" 76 | means the form of the work preferred for making modifications. 77 | 78 | 1.14. "You" (or "Your") 79 | means an individual or a legal entity exercising rights under this 80 | License. For legal entities, "You" includes any entity that 81 | controls, is controlled by, or is under common control with You. For 82 | purposes of this definition, "control" means (a) the power, direct 83 | or indirect, to cause the direction or management of such entity, 84 | whether by contract or otherwise, or (b) ownership of more than 85 | fifty percent (50%) of the outstanding shares or beneficial 86 | ownership of such entity. 87 | 88 | 2. License Grants and Conditions 89 | -------------------------------- 90 | 91 | 2.1. 
Grants 92 | 93 | Each Contributor hereby grants You a world-wide, royalty-free, 94 | non-exclusive license: 95 | 96 | (a) under intellectual property rights (other than patent or trademark) 97 | Licensable by such Contributor to use, reproduce, make available, 98 | modify, display, perform, distribute, and otherwise exploit its 99 | Contributions, either on an unmodified basis, with Modifications, or 100 | as part of a Larger Work; and 101 | 102 | (b) under Patent Claims of such Contributor to make, use, sell, offer 103 | for sale, have made, import, and otherwise transfer either its 104 | Contributions or its Contributor Version. 105 | 106 | 2.2. Effective Date 107 | 108 | The licenses granted in Section 2.1 with respect to any Contribution 109 | become effective for each Contribution on the date the Contributor first 110 | distributes such Contribution. 111 | 112 | 2.3. Limitations on Grant Scope 113 | 114 | The licenses granted in this Section 2 are the only rights granted under 115 | this License. No additional rights or licenses will be implied from the 116 | distribution or licensing of Covered Software under this License. 117 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 118 | Contributor: 119 | 120 | (a) for any code that a Contributor has removed from Covered Software; 121 | or 122 | 123 | (b) for infringements caused by: (i) Your and any other third party's 124 | modifications of Covered Software, or (ii) the combination of its 125 | Contributions with other software (except as part of its Contributor 126 | Version); or 127 | 128 | (c) under Patent Claims infringed by Covered Software in the absence of 129 | its Contributions. 130 | 131 | This License does not grant any rights in the trademarks, service marks, 132 | or logos of any Contributor (except as may be necessary to comply with 133 | the notice requirements in Section 3.4). 134 | 135 | 2.4. Subsequent Licenses 136 | 137 | No Contributor makes additional grants as a result of Your choice to 138 | distribute the Covered Software under a subsequent version of this 139 | License (see Section 10.2) or under the terms of a Secondary License (if 140 | permitted under the terms of Section 3.3). 141 | 142 | 2.5. Representation 143 | 144 | Each Contributor represents that the Contributor believes its 145 | Contributions are its original creation(s) or it has sufficient rights 146 | to grant the rights to its Contributions conveyed by this License. 147 | 148 | 2.6. Fair Use 149 | 150 | This License is not intended to limit any rights You have under 151 | applicable copyright doctrines of fair use, fair dealing, or other 152 | equivalents. 153 | 154 | 2.7. Conditions 155 | 156 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 157 | in Section 2.1. 158 | 159 | 3. Responsibilities 160 | ------------------- 161 | 162 | 3.1. Distribution of Source Form 163 | 164 | All distribution of Covered Software in Source Code Form, including any 165 | Modifications that You create or to which You contribute, must be under 166 | the terms of this License. You must inform recipients that the Source 167 | Code Form of the Covered Software is governed by the terms of this 168 | License, and how they can obtain a copy of this License. You may not 169 | attempt to alter or restrict the recipients' rights in the Source Code 170 | Form. 171 | 172 | 3.2. 
Distribution of Executable Form 173 | 174 | If You distribute Covered Software in Executable Form then: 175 | 176 | (a) such Covered Software must also be made available in Source Code 177 | Form, as described in Section 3.1, and You must inform recipients of 178 | the Executable Form how they can obtain a copy of such Source Code 179 | Form by reasonable means in a timely manner, at a charge no more 180 | than the cost of distribution to the recipient; and 181 | 182 | (b) You may distribute such Executable Form under the terms of this 183 | License, or sublicense it under different terms, provided that the 184 | license for the Executable Form does not attempt to limit or alter 185 | the recipients' rights in the Source Code Form under this License. 186 | 187 | 3.3. Distribution of a Larger Work 188 | 189 | You may create and distribute a Larger Work under terms of Your choice, 190 | provided that You also comply with the requirements of this License for 191 | the Covered Software. If the Larger Work is a combination of Covered 192 | Software with a work governed by one or more Secondary Licenses, and the 193 | Covered Software is not Incompatible With Secondary Licenses, this 194 | License permits You to additionally distribute such Covered Software 195 | under the terms of such Secondary License(s), so that the recipient of 196 | the Larger Work may, at their option, further distribute the Covered 197 | Software under the terms of either this License or such Secondary 198 | License(s). 199 | 200 | 3.4. Notices 201 | 202 | You may not remove or alter the substance of any license notices 203 | (including copyright notices, patent notices, disclaimers of warranty, 204 | or limitations of liability) contained within the Source Code Form of 205 | the Covered Software, except that You may alter any license notices to 206 | the extent required to remedy known factual inaccuracies. 207 | 208 | 3.5. Application of Additional Terms 209 | 210 | You may choose to offer, and to charge a fee for, warranty, support, 211 | indemnity or liability obligations to one or more recipients of Covered 212 | Software. However, You may do so only on Your own behalf, and not on 213 | behalf of any Contributor. You must make it absolutely clear that any 214 | such warranty, support, indemnity, or liability obligation is offered by 215 | You alone, and You hereby agree to indemnify every Contributor for any 216 | liability incurred by such Contributor as a result of warranty, support, 217 | indemnity or liability terms You offer. You may include additional 218 | disclaimers of warranty and limitations of liability specific to any 219 | jurisdiction. 220 | 221 | 4. Inability to Comply Due to Statute or Regulation 222 | --------------------------------------------------- 223 | 224 | If it is impossible for You to comply with any of the terms of this 225 | License with respect to some or all of the Covered Software due to 226 | statute, judicial order, or regulation then You must: (a) comply with 227 | the terms of this License to the maximum extent possible; and (b) 228 | describe the limitations and the code they affect. Such description must 229 | be placed in a text file included with all distributions of the Covered 230 | Software under this License. Except to the extent prohibited by statute 231 | or regulation, such description must be sufficiently detailed for a 232 | recipient of ordinary skill to be able to understand it. 233 | 234 | 5. Termination 235 | -------------- 236 | 237 | 5.1. 
The rights granted under this License will terminate automatically 238 | if You fail to comply with any of its terms. However, if You become 239 | compliant, then the rights granted under this License from a particular 240 | Contributor are reinstated (a) provisionally, unless and until such 241 | Contributor explicitly and finally terminates Your grants, and (b) on an 242 | ongoing basis, if such Contributor fails to notify You of the 243 | non-compliance by some reasonable means prior to 60 days after You have 244 | come back into compliance. Moreover, Your grants from a particular 245 | Contributor are reinstated on an ongoing basis if such Contributor 246 | notifies You of the non-compliance by some reasonable means, this is the 247 | first time You have received notice of non-compliance with this License 248 | from such Contributor, and You become compliant prior to 30 days after 249 | Your receipt of the notice. 250 | 251 | 5.2. If You initiate litigation against any entity by asserting a patent 252 | infringement claim (excluding declaratory judgment actions, 253 | counter-claims, and cross-claims) alleging that a Contributor Version 254 | directly or indirectly infringes any patent, then the rights granted to 255 | You by any and all Contributors for the Covered Software under Section 256 | 2.1 of this License shall terminate. 257 | 258 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 259 | end user license agreements (excluding distributors and resellers) which 260 | have been validly granted by You or Your distributors under this License 261 | prior to termination shall survive termination. 262 | 263 | ************************************************************************ 264 | * * 265 | * 6. Disclaimer of Warranty * 266 | * ------------------------- * 267 | * * 268 | * Covered Software is provided under this License on an "as is" * 269 | * basis, without warranty of any kind, either expressed, implied, or * 270 | * statutory, including, without limitation, warranties that the * 271 | * Covered Software is free of defects, merchantable, fit for a * 272 | * particular purpose or non-infringing. The entire risk as to the * 273 | * quality and performance of the Covered Software is with You. * 274 | * Should any Covered Software prove defective in any respect, You * 275 | * (not any Contributor) assume the cost of any necessary servicing, * 276 | * repair, or correction. This disclaimer of warranty constitutes an * 277 | * essential part of this License. No use of any Covered Software is * 278 | * authorized under this License except under this disclaimer. * 279 | * * 280 | ************************************************************************ 281 | 282 | ************************************************************************ 283 | * * 284 | * 7. Limitation of Liability * 285 | * -------------------------- * 286 | * * 287 | * Under no circumstances and under no legal theory, whether tort * 288 | * (including negligence), contract, or otherwise, shall any * 289 | * Contributor, or anyone who distributes Covered Software as * 290 | * permitted above, be liable to You for any direct, indirect, * 291 | * special, incidental, or consequential damages of any character * 292 | * including, without limitation, damages for lost profits, loss of * 293 | * goodwill, work stoppage, computer failure or malfunction, or any * 294 | * and all other commercial damages or losses, even if such party * 295 | * shall have been informed of the possibility of such damages. 
This * 296 | * limitation of liability shall not apply to liability for death or * 297 | * personal injury resulting from such party's negligence to the * 298 | * extent applicable law prohibits such limitation. Some * 299 | * jurisdictions do not allow the exclusion or limitation of * 300 | * incidental or consequential damages, so this exclusion and * 301 | * limitation may not apply to You. * 302 | * * 303 | ************************************************************************ 304 | 305 | 8. Litigation 306 | ------------- 307 | 308 | Any litigation relating to this License may be brought only in the 309 | courts of a jurisdiction where the defendant maintains its principal 310 | place of business and such litigation shall be governed by laws of that 311 | jurisdiction, without reference to its conflict-of-law provisions. 312 | Nothing in this Section shall prevent a party's ability to bring 313 | cross-claims or counter-claims. 314 | 315 | 9. Miscellaneous 316 | ---------------- 317 | 318 | This License represents the complete agreement concerning the subject 319 | matter hereof. If any provision of this License is held to be 320 | unenforceable, such provision shall be reformed only to the extent 321 | necessary to make it enforceable. Any law or regulation which provides 322 | that the language of a contract shall be construed against the drafter 323 | shall not be used to construe this License against a Contributor. 324 | 325 | 10. Versions of the License 326 | --------------------------- 327 | 328 | 10.1. New Versions 329 | 330 | Mozilla Foundation is the license steward. Except as provided in Section 331 | 10.3, no one other than the license steward has the right to modify or 332 | publish new versions of this License. Each version will be given a 333 | distinguishing version number. 334 | 335 | 10.2. Effect of New Versions 336 | 337 | You may distribute the Covered Software under the terms of the version 338 | of the License under which You originally received the Covered Software, 339 | or under the terms of any subsequent version published by the license 340 | steward. 341 | 342 | 10.3. Modified Versions 343 | 344 | If you create software not governed by this License, and you want to 345 | create a new license for such software, you may create and use a 346 | modified version of this License if you rename the license and remove 347 | any references to the name of the license steward (except to note that 348 | such modified license differs from this License). 349 | 350 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 351 | Licenses 352 | 353 | If You choose to distribute Source Code Form that is Incompatible With 354 | Secondary Licenses under the terms of this version of the License, the 355 | notice described in Exhibit B of this License must be attached. 356 | 357 | Exhibit A - Source Code Form License Notice 358 | ------------------------------------------- 359 | 360 | This Source Code Form is subject to the terms of the Mozilla Public 361 | License, v. 2.0. If a copy of the MPL was not distributed with this 362 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 363 | 364 | If it is not possible or desirable to put the notice in a particular 365 | file, then You may include the notice in a location (such as a LICENSE 366 | file in a relevant directory) where a recipient would be likely to look 367 | for such a notice. 368 | 369 | You may add additional accurate notices of copyright ownership. 
370 | 371 | Exhibit B - "Incompatible With Secondary Licenses" Notice 372 | --------------------------------------------------------- 373 | 374 | This Source Code Form is "Incompatible With Secondary Licenses", as 375 | defined by the Mozilla Public License, v. 2.0. 376 | -------------------------------------------------------------------------------- /src/defense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import time 4 | import joblib 5 | import numpy as np 6 | import src.config as cfg 7 | import src.functions as f 8 | import lightgbm as lgb 9 | from sklearn.metrics import roc_auc_score 10 | from data.pefeatures import PEFeatureReader 11 | from sklearn.preprocessing import StandardScaler 12 | from sklearn.linear_model import LogisticRegression 13 | 14 | 15 | NPZ_PATH = cfg.file['paths']['npz'] 16 | MODEL_PATH = cfg.file['paths']['model_path'] 17 | VECTORIZED_PATH = cfg.file['paths']['vectorized_path'] 18 | 19 | 20 | class Defense: 21 | 22 | def __init__(self, model, csv_path, features_path, number_examples): 23 | self.model = model 24 | self.csv_path = csv_path 25 | self.features_path = features_path 26 | self.number_examples = number_examples 27 | 28 | @staticmethod 29 | def create_uap_datasets(csv_path, features_path, uap_vector): 30 | """ 31 | Apply the UAP to the exploration set to validate the results obtained during exploration. 32 | Create two feature-space datasets: 33 | i) original examples (before UAP injection) 34 | ii) adversarial examples (after injecting UAP) 35 | 36 | Input: 37 | csv_path: path to load CSV file with malware examples 38 | features_path: path to save features from examples 39 | uap_vector: UAP vector calculated using model and dataset 40 | """ 41 | 42 | # Save features from problem-space malware 43 | f.save_features_malware(csv_path=csv_path, features_path=features_path, pert_vector=uap_vector) 44 | 45 | @staticmethod 46 | def extract_perturbation_from_features(features_path): 47 | """ 48 | Extract the noise / perturbation by subtracting the features of the original 49 | malware from those of the adversarial examples generated from it. 50 | i) Load original and adversarial datasets (features) 51 | ii) Subtract the original from the adversarial features, leaving only the noise 52 | 53 | Input: 54 | features_path: path to load features from examples 55 | """ 56 | # Load datasets of original & adversarial examples to extract noise 57 | adv_examples = np.load(features_path + 'adv_examples_uap_compress.npz') 58 | original_malware = np.load(features_path + 'orig_files_uap_compress.npz') 59 | 60 | # Extract features 61 | features_adv_examples = np.array(adv_examples['features']) 62 | features_original = np.array(original_malware['features']) 63 | 64 | # Calculate noise / perturbation based on adversarial - original malware 65 | noise = features_adv_examples - features_original 66 | 67 | return noise 68 | 69 | # DEFENSE 70 | 71 | # Define statistical model to generate adversarial examples 72 | @staticmethod 73 | def attack_statistical_model(malware_input, noise): 74 | """ 75 | Define statistical model to approximate noise / perturbations.
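The noise is modeled per feature as Gaussian: for each feature j, a perturbation r_j ~ N(mean(noise[:, j]), std(noise[:, j])**2) is drawn and added to feature j of every malware example (see the code below).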
76 | Author: Luis Munoz Gonzalez 77 | 78 | Input: 79 | malware_input: batch of malware examples 80 | noise: perturbation injected to generate adversarial examples 81 | """ 82 | # Number of features 83 | number_features = malware_input.shape[1] 84 | 85 | # Define mean and standard deviation vectors 86 | meanV = np.zeros(number_features) 87 | stdV = np.zeros(number_features) 88 | 89 | # Fit meanV and stdV to the noise, per feature 90 | for each_feat in range(number_features): 91 | meanV[each_feat] = np.mean(noise[:, each_feat]) 92 | stdV[each_feat] = np.std(noise[:, each_feat]) 93 | 94 | # Generate adversarial examples 95 | adv_ex = np.zeros(malware_input.shape) 96 | for e in range(meanV.size): 97 | rd = np.random.randn(malware_input.shape[0]) * stdV[e] + meanV[e] 98 | adv_ex[:, e] = malware_input[:, e] + rd 99 | 100 | return adv_ex 101 | 102 | def generate_adv_examples_statistical_model(self, malware_batch, noise, npz_path): 103 | """ 104 | Generate adversarial examples using a statistical model that approximates 105 | the distribution of the noise extracted from the features (in this case Gaussian) 106 | """ 107 | # Load adversarial examples or generate them using the statistical model 108 | if os.path.exists(npz_path + 'adversarial_examples_approximated.npz'): 109 | adv_examples_approximated = np.load(npz_path + 'adversarial_examples_approximated.npz') 110 | adv_examples_approximated = adv_examples_approximated['features'] 111 | 112 | else: 113 | # Generate the same number of adversarial examples as malicious ones 114 | adv_examples_approximated = self.attack_statistical_model(malware_batch, noise) 115 | 116 | # Save adversarial examples generated with the statistical model above 117 | np.savez(npz_path + 'adversarial_examples_approximated.npz', features=adv_examples_approximated) 118 | 119 | print('Adversarial data shape:', adv_examples_approximated.shape) 120 | return adv_examples_approximated 121 | 122 | def adversarial_training(self, noise): 123 | """ 124 | Perform adversarial training using either 1/2 of the dataset with adversarial 125 | examples + 1/2 benign (pure), or 1/4 of the dataset with adversarial 126 | examples + 1/4 malicious + 1/2 benign (mixed). 127 | Also, train a baseline model to use as a benchmark. 128 | 129 | i) Baseline: Train model with N malware and N benign examples. 130 | 131 | ii) Pure: Adversarially-train model 132 | a) Generate N adversarial samples with statistical model 133 | b) Train using N statistically-generated adversarial examples 134 | and N benign examples. 135 | 136 | iii) Mixed: Adversarially-train model 137 | a) Generate N/2 adversarial samples with statistical model 138 | b) Train using N/2 statistically-generated adversarial examples, 139 | N/2 malware and N benign examples.
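E.g., with N = 50,000: baseline = 50k malicious + 50k benign; pure = 50k adversarial + 50k benign; mixed = 25k malicious + 25k adversarial + 50k benign.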
140 | 141 | Input: 142 | noise: perturbation extracted from the features of adversarial examples and the original files 143 | 144 | """ 145 | 146 | # Define size of malicious, benign, and adversarial datasets 147 | # number_examples = 50000 148 | 149 | # Load EMBER data 150 | print('Loading datasets to train baseline & adversarial models: ') 151 | feature_reader = PEFeatureReader() 152 | X_train, y_train = feature_reader.read_vectorized_features(VECTORIZED_PATH, 'train', feature_version=1) 153 | if self.number_examples == 50000: 154 | start_examples = 38800 155 | end_examples = 189000 156 | X_train = X_train[start_examples:end_examples] 157 | y_train = y_train[start_examples:end_examples] 158 | print('Original features shape:', X_train.shape) 159 | 160 | # Filter only malicious 161 | malicious_rows = (y_train == 1) 162 | malware_batch = X_train[malicious_rows] 163 | malware_batch = malware_batch[:self.number_examples] 164 | print('Malicious features shape:', malware_batch.shape) 165 | 166 | # Filter only benign 167 | benign_rows = (y_train == 0) 168 | benign_batch = X_train[benign_rows] 169 | benign_batch = benign_batch[:self.number_examples] 170 | print('Benign features shape:', benign_batch.shape) 171 | 172 | # Generate adversarial examples for adversarial training 173 | adversarial_batch = self.generate_adv_examples_statistical_model(malware_batch, noise, npz_path=NPZ_PATH) 174 | 175 | print('\na) Train LGBM baseline with {} malicious and {} benign files'.format(self.number_examples, 176 | self.number_examples)) 177 | 178 | # Define datasets for baseline training 179 | 180 | # Load the model if it already exists, otherwise train it 181 | if os.path.exists(MODEL_PATH + 'ember_model_baseline.pkl'): 182 | lgbm_model_baseline = joblib.load(MODEL_PATH + 'ember_model_baseline.pkl') 183 | 184 | else: 185 | 186 | X_train = np.concatenate((malware_batch, benign_batch), axis=0) 187 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 188 | print('Train data shape for baseline model:', X_train.shape) 189 | 190 | # Create dataset for training 191 | lgbm_dataset = lgb.Dataset(X_train, y_train) 192 | print('Finished preparing dataset for training.') 193 | 194 | # Define parameters & train 195 | start_training = time.time() 196 | params = {"application": "binary"} 197 | lgbm_model_baseline = lgb.train(params, lgbm_dataset) 198 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 199 | 200 | lgbm_model_baseline.save_model(MODEL_PATH + 'ember_model_baseline.txt') 201 | joblib.dump(lgbm_model_baseline, MODEL_PATH + 'ember_model_baseline.pkl') 202 | print('Baseline model saved.') 203 | 204 | print('b) Adversarially train (Pure) LGBM with {} adversarial and {} benign files'.format(self.number_examples, 205 | self.number_examples)) 206 | 207 | # Define dataset for adversarially trained model 'pure' (AEs and benign) 208 | 209 | # Load the model if it already exists, otherwise train it 210 | if os.path.exists(MODEL_PATH + 'ember_model_adv_trained_pure.pkl'): 211 | lgbm_model_adv_trained_pure = joblib.load(MODEL_PATH + 'ember_model_adv_trained_pure.pkl') 212 | 213 | else: 214 | 215 | X_train = np.concatenate((adversarial_batch, benign_batch), axis=0) 216 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 217 | print('Train data shape for pure adversarial model:', X_train.shape) 218 | 219 | # Create dataset for training 220 | lgbm_dataset = lgb.Dataset(X_train, y_train) 221 | print('Finished
preparing dataset for training.') 222 | 223 | # Define parameters & train 224 | start_training = time.time() 225 | params = {"application": "binary"} 226 | lgbm_model_adv_trained_pure = lgb.train(params, lgbm_dataset) 227 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 228 | 229 | lgbm_model_adv_trained_pure.save_model(MODEL_PATH + 'ember_model_adv_trained_pure.txt') 230 | joblib.dump(lgbm_model_adv_trained_pure, MODEL_PATH + 'ember_model_adv_trained_pure.pkl') 231 | print('Adversarially trained (pure) model saved.') 232 | 233 | # Define dataset for adversarially trained model 'mixed' (AEs + malware and benign) 234 | 235 | print('c) Adversarially train (Mixed) LGBM with {} adversarial, {} malicious, and {} benign files'.format( 236 | int(self.number_examples / 2), int(self.number_examples / 2), self.number_examples)) 237 | 238 | # Load the model if it already exists, otherwise train it 239 | if os.path.exists(MODEL_PATH + 'ember_model_adv_trained_mixed.pkl'): 240 | lgbm_model_adv_trained_mixed = joblib.load(MODEL_PATH + 'ember_model_adv_trained_mixed.pkl') 241 | 242 | else: 243 | number_examples_mal_adv = int(self.number_examples / 2) 244 | X_train = np.concatenate( 245 | (malware_batch[:number_examples_mal_adv], adversarial_batch[:number_examples_mal_adv], benign_batch), 246 | axis=0) 247 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 248 | print('Train data shape for mixed adversarial model:', X_train.shape) 249 | 250 | # Create dataset for training 251 | lgbm_dataset = lgb.Dataset(X_train, y_train) 252 | print('Finished preparing dataset for training.') 253 | 254 | # Define params & train | with feature_version = 1 (2351 features) 255 | start_training = time.time() 256 | params = {"application": "binary"} 257 | lgbm_model_adv_trained_mixed = lgb.train(params, lgbm_dataset) 258 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 259 | 260 | lgbm_model_adv_trained_mixed.save_model(MODEL_PATH + 'ember_model_adv_trained_mixed.txt') 261 | joblib.dump(lgbm_model_adv_trained_mixed, MODEL_PATH + 'ember_model_adv_trained_mixed.pkl') 262 | print('Adversarially trained (mixed) model saved.') 263 | 264 | return lgbm_model_baseline, lgbm_model_adv_trained_pure, lgbm_model_adv_trained_mixed 265 | 266 | def train_logit(self, model_path): 267 | """ 268 | Train a logistic regression model. 269 | 270 | Input: 271 | model_path: path to save & load trained logit model 272 | """ 273 | time_all = time.time() 274 | 275 | # Load EMBER data 276 | print('\nLoading datasets to train LR model: ') 277 | feature_reader = PEFeatureReader() 278 | X_train, y_train, X_test, y_test = feature_reader.read_vectorized_features(VECTORIZED_PATH, feature_version=1) 279 | if self.number_examples == 50000: 280 | start_examples = 38800 281 | end_examples = 189000 282 | X_train = X_train[start_examples:end_examples] 283 | y_train = y_train[start_examples:end_examples] 284 | print('Original features shape:', X_train.shape) 285 | 286 | # Select fewer samples to avoid crashing when working in a notebook 287 | # minvalue = 0 288 | # maxvalue = 900000 # For 100,000 samples: AUC 0.94.
289 | # X_train = X_train[minvalue:maxvalue] 290 | # y_train = y_train[minvalue:maxvalue] 291 | # print('Current data shape:', X_train.shape) 292 | 293 | # Filter out unlabeled 294 | train_rows = (y_train != -1) 295 | X_train = X_train[train_rows] 296 | y_train = y_train[train_rows] 297 | print('Filtered features shape:', X_train.shape) 298 | 299 | # If the training data was reduced, adjust the test data 300 | if self.number_examples == 50000: 301 | test_examples = 30000 302 | X_test = X_test[:test_examples] 303 | y_test = y_test[:test_examples] 304 | print('Test features shape:', X_test.shape) 305 | 306 | # Scale data 307 | norm_std_scaler = StandardScaler().fit(X_train) 308 | X_train = norm_std_scaler.transform(X_train) 309 | X_test = norm_std_scaler.transform(X_test) 310 | 311 | # Load the pre-trained logit model 312 | if os.path.exists(model_path + 'logit_ember.pkl'): 313 | clf_LR = joblib.load(model_path + 'logit_ember.pkl') 314 | else: 315 | # Train the model on the dataset 316 | print('Model not found, LR will be trained...') 317 | clf_LR = LogisticRegression(random_state=24) 318 | clf_LR = clf_LR.fit(X_train, y_train) 319 | joblib.dump(clf_LR, model_path + 'logit_ember.pkl') 320 | 321 | # Show processing time in h:m:s 322 | m, s = divmod(time.time() - time_all, 60) 323 | h, m = divmod(m, 60) 324 | print("Time elapsed training logit: %d:%02d:%02d" % (h, m, s)) 325 | 326 | # Calculate predictions with LR model 327 | print("Model {}".format(clf_LR.__class__.__name__)) 328 | y_pred = clf_LR.predict(X_test) 329 | print("ROC-AUC LR:", roc_auc_score(y_test, y_pred)) 330 | 331 | return clf_LR 332 | 333 | @staticmethod 334 | def extract_important_features(model, features_path): 335 | """ 336 | Extract the most important features from the logit model. 337 | """ 338 | # Get importance weights for LR model 339 | importance = model.coef_ 340 | importance = importance[0] 341 | 342 | # Collect indexes of features that share the same importance weight (excluding features with weight 0) 343 | repeated_indexes = [] 344 | repeated_values = [] 345 | for i, v in enumerate(importance): 346 | curr_repeated_indexes = [idx for idx in range(len(importance)) if importance[idx] == importance[i]] 347 | if len(curr_repeated_indexes) > 1 and v != 0: 348 | repeated_indexes.append(curr_repeated_indexes) 349 | repeated_values.append(v) 350 | 351 | if repeated_indexes: # Only 46 if 0.0 is included as feature value (same weight) 352 | print(len(repeated_indexes), repeated_indexes) 353 | print(len(repeated_values), repeated_values) 354 | 355 | # Get the top-j important features & their indexes (list.index() returns the first match for duplicated weights) 356 | j = 474 # arbitrarily chosen ~20% of 2351 357 | top_j_features = sorted(importance, reverse=True)[:j] 358 | indices = [list(importance).index(value) for value in top_j_features] 359 | print('\nIdentified top 20% features based on feature importances of LR.') 360 | # print('Top {} values: {}'.format(j, top_j_features)) 361 | # print('Top {} indexes: {}'.format(j, indices)) 362 | print() 363 | 364 | np.savez(features_path + 'top_features_LR_importances_indices', indices) 365 | 366 | return indices 367 | 368 | def train_lgbm_important_features(self, features_path): 369 | """ 370 | Train the LightGBM model with the EMBER dataset using only the top 371 | features ranked by the logit model's importance weights.
372 | 373 | Input: 374 | features_path: path to save features from examples 375 | """ 376 | 377 | # Load EMBER data 378 | print('Loading datasets to train LGBM with feature reduction: ') 379 | feature_reader = PEFeatureReader() 380 | X_train, y_train, X_test, _ = feature_reader.read_vectorized_features(VECTORIZED_PATH, feature_version=1) 381 | if self.number_examples == 50000: 382 | start_examples = 38800 383 | end_examples = 189000 384 | X_train = X_train[start_examples:end_examples] 385 | y_train = y_train[start_examples:end_examples] 386 | print('Original features shape:', X_train.shape) 387 | 388 | # Filter unlabeled data 389 | train_rows = (y_train != -1) 390 | X_train = X_train[train_rows] 391 | y_train = y_train[train_rows] 392 | print('Filtered features shape:', X_train.shape) 393 | 394 | # If the training data was reduced, adjust the test data 395 | if self.number_examples == 50000: 396 | test_examples = 30000 397 | X_test = X_test[:test_examples] 398 | print('Test features shape:', X_test.shape) 399 | 400 | # Use only the 20% highest-importance features from the logit model 401 | top_features_LR_importances = np.load(features_path + 'top_features_LR_importances_indices.npz') 402 | top_features_LR_importances = top_features_LR_importances['arr_0'] 403 | X_train = X_train[:, top_features_LR_importances] 404 | print('Top 20% features shape:', X_train.shape) 405 | 406 | # Create dataset for training 407 | lgbm_dataset = lgb.Dataset(X_train, y_train) 408 | # print('Finished preparing dataset for training.\n') 409 | 410 | # Define parameters & train 411 | start_training = time.time() 412 | params = {"application": "binary"} 413 | lgbm_model_reduced = lgb.train(params, lgbm_dataset) 414 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 415 | 416 | lgbm_model_reduced.save_model(MODEL_PATH + 'ember_model_reduced.txt') 417 | joblib.dump(lgbm_model_reduced, MODEL_PATH + 'ember_model_reduced.pkl') 418 | print('Feature-reduced model saved.\n') 419 | 420 | return lgbm_model_reduced 421 | 422 | def train_feature_reduction(self, model_path, features_path): 423 | """ 424 | i) Train a Logistic Regression Model 425 | ii) Extract feature_importances from Logit 426 | iii) Retrain LGBM with 20% most important features 427 | """ 428 | # Train Logit 429 | clf_LR = self.train_logit(model_path=model_path) 430 | 431 | # Extract feature_importances from Logit 432 | self.extract_important_features(model=clf_LR, features_path=features_path) 433 | 434 | # Retrain LGBM with 20% most important features 435 | self.train_lgbm_important_features(features_path=features_path) 436 | -------------------------------------------------------------------------------- /data/pefeatures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' Extracts some basic features from PE files. Many of the features 3 | implemented have been used in previously published works.
For more information, 4 | check out the following resources: 5 | * Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf 6 | * Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf 7 | * Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf 8 | * Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf 9 | * Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf 10 | 11 | It may be useful to do feature selection to reduce this set of features to a meaningful set 12 | for your modeling problem. 13 | 14 | Source: https://github.com/endgameinc/ember 15 | ''' 16 | import os 17 | import re 18 | import lief 19 | import hashlib 20 | import numpy as np 21 | from sklearn.feature_extraction import FeatureHasher 22 | 23 | LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.') 24 | LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 ) 25 | 26 | 27 | class FeatureType(object): 28 | ''' Base class from which each feature type may inherit ''' 29 | 30 | name = '' 31 | dim = 0 32 | 33 | def __repr__(self): 34 | return '{}({})'.format(self.name, self.dim) 35 | 36 | def raw_features(self, bytez, lief_binary): 37 | ''' Generate a JSON-able representation of the file ''' 38 | raise (NotImplementedError) 39 | 40 | def process_raw_features(self, raw_obj): 41 | ''' Generate a feature vector from the raw features ''' 42 | raise (NotImplementedError) 43 | 44 | def feature_vector(self, bytez, lief_binary): 45 | ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently 46 | if there are significant speedups to be gained from combining the two functions. ''' 47 | return self.process_raw_features(self.raw_features(bytez, lief_binary)) 48 | 49 | 50 | class ByteHistogram(FeatureType): 51 | ''' Byte histogram (count + non-normalized) over the entire binary file ''' 52 | 53 | name = 'histogram' 54 | dim = 256 55 | 56 | def __init__(self): 57 | super(FeatureType, self).__init__() 58 | 59 | def raw_features(self, bytez, lief_binary): 60 | counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256) 61 | return counts.tolist() 62 | 63 | def process_raw_features(self, raw_obj): 64 | counts = np.array(raw_obj, dtype=np.float32) 65 | sum = counts.sum() 66 | normalized = counts / sum 67 | return normalized 68 | 69 | 70 | class ByteEntropyHistogram(FeatureType): 71 | ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015). 72 | This roughly approximates the joint probability of byte value and local entropy. 73 | See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info. 74 | ''' 75 | 76 | name = 'byteentropy' 77 | dim = 256 78 | 79 | def __init__(self, step=1024, window=2048): 80 | super(FeatureType, self).__init__() 81 | self.window = window 82 | self.step = step 83 | 84 | def _entropy_bin_counts(self, block): 85 | # coarse histogram, 16 bytes per bin 86 | c = np.bincount(block >> 4, minlength=16) # 16-bin histogram 87 | p = c.astype(np.float32) / self.window 88 | wh = np.where(c)[0] 89 | H = np.sum(-p[wh] * np.log2( 90 | p[wh])) * 2 # * x2 b.c. 
we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits) 91 | 92 | Hbin = int(H * 2) # up to 16 bins (max entropy is 8 bits) 93 | if Hbin == 16: # handle entropy = 8.0 bits 94 | Hbin = 15 95 | 96 | return Hbin, c 97 | 98 | def raw_features(self, bytez, lief_binary): 99 | output = np.zeros((16, 16), dtype=int) 100 | a = np.frombuffer(bytez, dtype=np.uint8) 101 | if a.shape[0] < self.window: 102 | Hbin, c = self._entropy_bin_counts(a) 103 | output[Hbin, :] += c 104 | else: 105 | # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html 106 | shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window) 107 | strides = a.strides + (a.strides[-1],) 108 | blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :] 109 | 110 | # from the blocks, compute histogram 111 | for block in blocks: 112 | Hbin, c = self._entropy_bin_counts(block) 113 | output[Hbin, :] += c 114 | 115 | return output.flatten().tolist() 116 | 117 | def process_raw_features(self, raw_obj): 118 | counts = np.array(raw_obj, dtype=np.float32) 119 | sum = counts.sum() 120 | normalized = counts / sum 121 | return normalized 122 | 123 | 124 | class SectionInfo(FeatureType): 125 | ''' Information about section names, sizes and entropy. Uses hashing trick 126 | to summarize all this section info into a feature vector. 127 | ''' 128 | 129 | name = 'section' 130 | dim = 5 + 50 + 50 + 50 + 50 + 50 131 | 132 | def __init__(self): 133 | super(FeatureType, self).__init__() 134 | 135 | @staticmethod 136 | def _properties(s): 137 | return [str(c).split('.')[-1] for c in s.characteristics_lists] 138 | 139 | def raw_features(self, bytez, lief_binary): 140 | if lief_binary is None: 141 | return {"entry": "", "sections": []} 142 | 143 | # properties of entry point, or if invalid, the first executable section 144 | try: 145 | entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name 146 | except lief.not_found: 147 | # bad entry point, let's find the first executable section 148 | entry_section = "" 149 | for s in lief_binary.sections: 150 | if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: 151 | entry_section = s.name 152 | break 153 | 154 | raw_obj = {"entry": entry_section} 155 | raw_obj["sections"] = [{ 156 | 'name': s.name, 157 | 'size': s.size, 158 | 'entropy': s.entropy, 159 | 'vsize': s.virtual_size, 160 | 'props': self._properties(s) 161 | } for s in lief_binary.sections] 162 | return raw_obj 163 | 164 | def process_raw_features(self, raw_obj): 165 | sections = raw_obj['sections'] 166 | general = [ 167 | len(sections), # total number of sections 168 | # number of sections with zero size 169 | sum(1 for s in sections if s['size'] == 0), 170 | # number of sections with an empty name 171 | sum(1 for s in sections if s['name'] == ""), 172 | # number of RX (read + execute) sections 173 | sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']), 174 | # number of W (writable) sections 175 | sum(1 for s in sections if 'MEM_WRITE' in s['props']) 176 | ] 177 | # gross characteristics of each section 178 | section_sizes = [(s['name'], s['size']) for s in sections] 179 | section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0] 180 | section_entropy = [(s['name'], s['entropy']) for s in sections] 181 | section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] 182 | section_vsize = [(s['name'], s['vsize']) for s in sections] 183 | section_vsize_hashed
= FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] 184 | entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0] 185 | characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] 186 | characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] 187 | 188 | return np.hstack([ 189 | general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed, 190 | characteristics_hashed 191 | ]).astype(np.float32) 192 | 193 | 194 | class ImportsInfo(FeatureType): 195 | ''' Information about imported libraries and functions from the 196 | import address table. Note that the total number of imported 197 | functions is contained in GeneralFileInfo. 198 | ''' 199 | 200 | name = 'imports' 201 | dim = 1280 202 | 203 | def __init__(self): 204 | super(FeatureType, self).__init__() 205 | 206 | def raw_features(self, bytez, lief_binary): 207 | imports = {} 208 | if lief_binary is None: 209 | return imports 210 | 211 | for lib in lief_binary.imports: 212 | if lib.name not in imports: 213 | imports[lib.name] = [] # libraries can be duplicated in listing, extend instead of overwrite 214 | 215 | # Clipping assumes there are diminishing returns on the discriminatory power of imported functions 216 | # beyond the first 10000 characters, and this will help limit the dataset size 217 | for entry in lib.entries: 218 | if entry.is_ordinal: 219 | imports[lib.name].append("ordinal" + str(entry.ordinal)) 220 | else: 221 | imports[lib.name].append(entry.name[:10000]) 222 | 223 | return imports 224 | 225 | def process_raw_features(self, raw_obj): 226 | # unique libraries 227 | libraries = list(set([l.lower() for l in raw_obj.keys()])) 228 | libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0] 229 | 230 | # A string like "kernel32.dll:CreateFileMappingA" for each imported function 231 | imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist] 232 | imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0] 233 | 234 | # Two separate elements: libraries (alone) and fully-qualified names of imported functions 235 | return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32) 236 | 237 | 238 | class ExportsInfo(FeatureType): 239 | ''' Information about exported functions. Note that the total number of exported 240 | functions is contained in GeneralFileInfo. 
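Exported function names are summarized with the hashing trick into a 128-bin vector (see process_raw_features below).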
241 | ''' 242 | 243 | name = 'exports' 244 | dim = 128 245 | 246 | def __init__(self): 247 | super(FeatureType, self).__init__() 248 | 249 | def raw_features(self, bytez, lief_binary): 250 | if lief_binary is None: 251 | return [] 252 | 253 | # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond 254 | # the first 10000 characters, and this will help limit the dataset size 255 | if LIEF_EXPORT_OBJECT: 256 | # export is an object with .name attribute (0.10.0 and later) 257 | clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions] 258 | else: 259 | # export is a string (LIEF 0.9.0 and earlier) 260 | clipped_exports = [export[:10000] for export in lief_binary.exported_functions] 261 | 262 | 263 | return clipped_exports 264 | 265 | def process_raw_features(self, raw_obj): 266 | exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0] 267 | return exports_hashed.astype(np.float32) 268 | 269 | 270 | class GeneralFileInfo(FeatureType): 271 | ''' General information about the file ''' 272 | 273 | name = 'general' 274 | dim = 10 275 | 276 | def __init__(self): 277 | super(FeatureType, self).__init__() 278 | 279 | def raw_features(self, bytez, lief_binary): 280 | if lief_binary is None: 281 | return { 282 | 'size': len(bytez), 283 | 'vsize': 0, 284 | 'has_debug': 0, 285 | 'exports': 0, 286 | 'imports': 0, 287 | 'has_relocations': 0, 288 | 'has_resources': 0, 289 | 'has_signature': 0, 290 | 'has_tls': 0, 291 | 'symbols': 0 292 | } 293 | 294 | return { 295 | 'size': len(bytez), 296 | 'vsize': lief_binary.virtual_size, 297 | 'has_debug': int(lief_binary.has_debug), 298 | 'exports': len(lief_binary.exported_functions), 299 | 'imports': len(lief_binary.imported_functions), 300 | 'has_relocations': int(lief_binary.has_relocations), 301 | 'has_resources': int(lief_binary.has_resources), 302 | 'has_signature': int(lief_binary.has_signature), 303 | 'has_tls': int(lief_binary.has_tls), 304 | 'symbols': len(lief_binary.symbols), 305 | } 306 | 307 | def process_raw_features(self, raw_obj): 308 | return np.asarray([ 309 | raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'], 310 | raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'], 311 | raw_obj['symbols'] 312 | ], 313 | dtype=np.float32) 314 | 315 | 316 | class HeaderFileInfo(FeatureType): 317 | ''' Machine, architecture, OS, linker and other information extracted from the header ''' 318 | 319 | name = 'header' 320 | dim = 62 321 | 322 | def __init__(self): 323 | super(FeatureType, self).__init__() 324 | 325 | def raw_features(self, bytez, lief_binary): 326 | raw_obj = {} 327 | raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []} 328 | raw_obj['optional'] = { 329 | 'subsystem': "", 330 | 'dll_characteristics': [], 331 | 'magic': "", 332 | 'major_image_version': 0, 333 | 'minor_image_version': 0, 334 | 'major_linker_version': 0, 335 | 'minor_linker_version': 0, 336 | 'major_operating_system_version': 0, 337 | 'minor_operating_system_version': 0, 338 | 'major_subsystem_version': 0, 339 | 'minor_subsystem_version': 0, 340 | 'sizeof_code': 0, 341 | 'sizeof_headers': 0, 342 | 'sizeof_heap_commit': 0 343 | } 344 | if lief_binary is None: 345 | return raw_obj 346 | 347 | raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps 348 | raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1] 349 | raw_obj['coff']['characteristics'] =
[str(c).split('.')[-1] for c in lief_binary.header.characteristics_list] 350 | raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1] 351 | raw_obj['optional']['dll_characteristics'] = [ 352 | str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists 353 | ] 354 | raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1] 355 | raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version 356 | raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version 357 | raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version 358 | raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version 359 | raw_obj['optional'][ 360 | 'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version 361 | raw_obj['optional'][ 362 | 'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version 363 | raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version 364 | raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version 365 | raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code 366 | raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers 367 | raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit 368 | return raw_obj 369 | 370 | def process_raw_features(self, raw_obj): 371 | return np.hstack([ 372 | raw_obj['coff']['timestamp'], 373 | FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0], 374 | FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0], 375 | FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0], 376 | FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0], 377 | FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0], 378 | raw_obj['optional']['major_image_version'], 379 | raw_obj['optional']['minor_image_version'], 380 | raw_obj['optional']['major_linker_version'], 381 | raw_obj['optional']['minor_linker_version'], 382 | raw_obj['optional']['major_operating_system_version'], 383 | raw_obj['optional']['minor_operating_system_version'], 384 | raw_obj['optional']['major_subsystem_version'], 385 | raw_obj['optional']['minor_subsystem_version'], 386 | raw_obj['optional']['sizeof_code'], 387 | raw_obj['optional']['sizeof_headers'], 388 | raw_obj['optional']['sizeof_heap_commit'], 389 | ]).astype(np.float32) 390 | 391 | 392 | class StringExtractor(FeatureType): 393 | ''' Extracts strings from raw byte stream ''' 394 | 395 | name = 'strings' 396 | dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1 397 | 398 | def __init__(self): 399 | super(FeatureType, self).__init__() 400 | # all consecutive runs of 0x20 - 0x7f that are 5+ characters 401 | self._allstrings = re.compile(b'[\x20-\x7f]{5,}') 402 | # occurrences of the string 'C:\'. Not actually extracting the path 403 | self._paths = re.compile(b'c:\\\\', re.IGNORECASE) 404 | # occurrences of http:// or https://. Not actually extracting the URLs 405 | self._urls = re.compile(b'https?://', re.IGNORECASE) 406 | # occurrences of the string prefix HKEY_.
Not actually extracting registry names 407 | self._registry = re.compile(b'HKEY_') 408 | # crude evidence of an MZ header (dropper?) somewhere in the byte stream 409 | self._mz = re.compile(b'MZ') 410 | 411 | def raw_features(self, bytez, lief_binary): 412 | allstrings = self._allstrings.findall(bytez) 413 | if allstrings: 414 | # statistics about strings: 415 | string_lengths = [len(s) for s in allstrings] 416 | avlength = sum(string_lengths) / len(string_lengths) 417 | # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive 418 | as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)] 419 | c = np.bincount(as_shifted_string, minlength=96) # histogram count 420 | # distribution of characters in printable strings 421 | csum = c.sum() 422 | p = c.astype(np.float32) / csum 423 | wh = np.where(c)[0] 424 | H = np.sum(-p[wh] * np.log2(p[wh])) # entropy 425 | else: 426 | avlength = 0 427 | c = np.zeros((96,), dtype=np.float32) 428 | H = 0 429 | csum = 0 430 | 431 | return { 432 | 'numstrings': len(allstrings), 433 | 'avlength': avlength, 434 | 'printabledist': c.tolist(), # store non-normalized histogram 435 | 'printables': int(csum), 436 | 'entropy': float(H), 437 | 'paths': len(self._paths.findall(bytez)), 438 | 'urls': len(self._urls.findall(bytez)), 439 | 'registry': len(self._registry.findall(bytez)), 440 | 'MZ': len(self._mz.findall(bytez)) 441 | } 442 | 443 | def process_raw_features(self, raw_obj): 444 | hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0 445 | return np.hstack([ 446 | raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'], 447 | np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'], 448 | raw_obj['registry'], raw_obj['MZ'] 449 | ]).astype(np.float32) 450 | 451 | 452 | class DataDirectories(FeatureType): 453 | ''' Extracts size and virtual address of the first 15 data directories ''' 454 | 455 | name = 'datadirectories' 456 | dim = 15 * 2 457 | 458 | def __init__(self): 459 | super(FeatureType, self).__init__() 460 | self._name_order = [ 461 | "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE", 462 | "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE", 463 | "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER" 464 | ] 465 | 466 | def raw_features(self, bytez, lief_binary): 467 | output = [] 468 | if lief_binary is None: 469 | return output 470 | 471 | for data_directory in lief_binary.data_directories: 472 | output.append({ 473 | "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""), 474 | "size": data_directory.size, 475 | "virtual_address": data_directory.rva 476 | }) 477 | return output 478 | 479 | def process_raw_features(self, raw_obj): 480 | features = np.zeros(2 * len(self._name_order), dtype=np.float32) 481 | for i in range(len(self._name_order)): 482 | if i < len(raw_obj): 483 | features[2 * i] = raw_obj[i]["size"] 484 | features[2 * i + 1] = raw_obj[i]["virtual_address"] 485 | return features 486 | 487 | 488 | class PEFeatureExtractor(object): 489 | ''' Extract useful features from a PE file, and return as a vector of fixed size.
''' 490 | 491 | def __init__(self, feature_version=1): 492 | self.features = [ 493 | ByteHistogram(), 494 | ByteEntropyHistogram(), 495 | StringExtractor(), 496 | GeneralFileInfo(), 497 | HeaderFileInfo(), 498 | SectionInfo(), 499 | ImportsInfo(), 500 | ExportsInfo() 501 | ] 502 | if feature_version == 1: 503 | if not lief.__version__.startswith("0.8.3"): 504 | pass 505 | #print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75") 506 | #print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") 507 | #print(f"WARNING: in the feature calculations.") 508 | elif feature_version == 2: 509 | self.features.append(DataDirectories()) 510 | if not lief.__version__.startswith("0.9.0"): 511 | pass 512 | #print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-") 513 | #print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") 514 | #print(f"WARNING: in the feature calculations.") 515 | else: 516 | raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}") 517 | self.dim = sum([fe.dim for fe in self.features]) 518 | 519 | def raw_features(self, bytez): 520 | lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, 521 | RuntimeError) 522 | try: 523 | lief_binary = lief.PE.parse(list(bytez)) 524 | except lief_errors as e: 525 | print("lief error: ", str(e)) 526 | lief_binary = None 527 | except Exception: # everything else (KeyboardInterrupt, SystemExit, ValueError): 528 | raise 529 | 530 | features = {"sha256": hashlib.sha256(bytez).hexdigest()} 531 | features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features}) 532 | return features 533 | 534 | def process_raw_features(self, raw_obj): 535 | feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features] 536 | return np.hstack(feature_vectors).astype(np.float32) 537 | 538 | def feature_vector(self, bytez): 539 | return self.process_raw_features(self.raw_features(bytez)) 540 | 541 | 542 | class PEFeatureReader(object): 543 | ''' Load features from vectorized .dat files. 
''' 544 | 545 | def read_vectorized_features(self, data_dir, subset=None, feature_version=1): 546 | """ 547 | Read vectorized features into memory mapped numpy arrays 548 | """ 549 | if subset is not None and subset not in ["train", "test"]: 550 | return None 551 | 552 | extractor = PEFeatureExtractor(feature_version) 553 | ndim = extractor.dim 554 | X_train = None 555 | y_train = None 556 | X_test = None 557 | y_test = None 558 | 559 | if subset is None or subset == "train": 560 | X_train_path = os.path.join(data_dir, "X_train.dat") 561 | y_train_path = os.path.join(data_dir, "y_train.dat") 562 | y_train = np.memmap(y_train_path, dtype=np.float32, mode="r") 563 | N = y_train.shape[0] 564 | X_train = np.memmap(X_train_path, dtype=np.float32, mode="r", shape=(N, ndim)) 565 | if subset == "train": 566 | return X_train, y_train 567 | 568 | if subset is None or subset == "test": 569 | X_test_path = os.path.join(data_dir, "X_test.dat") 570 | y_test_path = os.path.join(data_dir, "y_test.dat") 571 | y_test = np.memmap(y_test_path, dtype=np.float32, mode="r") 572 | N = y_test.shape[0] 573 | X_test = np.memmap(X_test_path, dtype=np.float32, mode="r", shape=(N, ndim)) 574 | if subset == "test": 575 | return X_test, y_test 576 | 577 | return X_train, y_train, X_test, y_test 578 | -------------------------------------------------------------------------------- /src/functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import lief 5 | import json 6 | import time 7 | import random 8 | import shutil 9 | import joblib 10 | import zipfile 11 | import lightgbm 12 | import requests 13 | import subprocess 14 | import numpy as np 15 | import pandas as pd 16 | import src.config as cfg 17 | from hashlib import sha256 18 | import data.manipulate as m 19 | from datetime import timedelta 20 | from data.pefeatures import PEFeatureExtractor 21 | 22 | VT_API_KEY = cfg.file['apiKeys']['vt'] 23 | HA_API_KEY = cfg.file['apiKeys']['ha'] 24 | MD_API_KEY = cfg.file['apiKeys']['md'] 25 | 26 | EXCEPTIONS = (MemoryError, lief.bad_file, lief.bad_format, lief.not_found) 27 | 28 | 29 | def time_me(start_time): 30 | """ 31 | Timer returning output in following format HH:MM:SS 32 | """ 33 | # Show total time in hh:mm:ss 34 | minutes, seconds = divmod(time.time() - start_time, 60) 35 | hours, minutes = divmod(minutes, 60) 36 | print('\nProcessing time: %02d:%02d:%02d\n' % (hours, minutes, seconds)) 37 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 38 | 39 | 40 | def readfile(filename): 41 | """ 42 | Convert file into bytes 43 | """ 44 | 45 | with open(filename, "rb") as b: 46 | b_bytes = b.read() 47 | return b_bytes 48 | 49 | 50 | def unzip_file(zipped_path, unzipped_path): 51 | """ 52 | Unzip downloaded malware with standard industry password 53 | """ 54 | 55 | for item in os.listdir(zipped_path): 56 | if item.endswith(".zip"): 57 | full_path = zipped_path + item 58 | zip_file = zipfile.ZipFile(full_path, 'r') 59 | zip_file.setpassword(b"infected") # Industry password for malware 60 | zip_file.extractall(unzipped_path) 61 | zip_file.close() 62 | 63 | 64 | def hash_files(filename): 65 | """ 66 | Return SHA256 of a file 67 | """ 68 | 69 | h = sha256() 70 | with open(filename, 'rb', buffering=0) as f: 71 | for b in iter(lambda: f.read(128 * 1024), b''): 72 | h.update(b) 73 | return h.hexdigest() 74 | 75 | 76 | def rename_files(files_path): 77 | """ 78 | Rename files with SHA256 value 79 | """ 80 | 81 | for item in os.listdir(files_path): 82 
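# Note (annotation): paths are built by plain concatenation, so files_path is
# expected to end with '/'; os.path.join(files_path, item) would be the more
# robust choice. Renaming to the SHA256 digest is also idempotent: a second
# run maps every file to the name it already has.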
| files = files_path + item 83 | sha = hash_files(files) 84 | os.rename(files, files_path + sha) 85 | 86 | 87 | def url_ok(url): 88 | """ 89 | Check URL status 90 | """ 91 | 92 | r = requests.get(url, timeout=10) 93 | return r.status_code 94 | 95 | 96 | def create_sequential_actions(size_of_actions, n): 97 | """ 98 | Return vector filled with sequential perturbations 99 | e.g: 100 | for n = 4 and size_of_actions = 10 101 | 102 | [0, 0, 0, 0] 103 | [0, 0, 0, 1] 104 | [0, 0, 0, 2] 105 | ... 106 | [9, 9, 9, 9] 107 | """ 108 | 109 | sequential_actions = [] 110 | string_format_n = "{0:0" + str(n) + "}" 111 | cases_generated = [string_format_n.format(i) for i in range(size_of_actions ** n)] 112 | 113 | for i in range(len(cases_generated)): 114 | sequential_actions.append([int(s) for s in cases_generated[i]]) 115 | 116 | return sequential_actions 117 | 118 | 119 | def create_random_actions(size_of_actions, n): 120 | """ 121 | Return vector filled with random perturbations 122 | """ 123 | 124 | random.seed() 125 | random_actions = random.sample(range(size_of_actions), n) 126 | return random_actions 127 | 128 | 129 | def actions_vector(actions_dict): 130 | """ 131 | Creating a dict with all available perturbations 132 | """ 133 | 134 | actions = {i: act for i, act in enumerate(actions_dict)} 135 | return actions 136 | 137 | 138 | def build_bytes(input_bytes, total_number_perturbations): 139 | """ 140 | Compile a malware mutation after perturbations are injected 141 | 142 | Input: 143 | input_bytes: input malware in bytes 144 | total_number_perturbations: number of perturbations injected to keep track in name 145 | """ 146 | 147 | try: 148 | new_binary = lief.PE.parse(list(input_bytes)) 149 | builder = lief.PE.Builder(new_binary) 150 | builder.build_imports(True) 151 | builder.patch_imports(True) 152 | builder.build() 153 | name_mod_file = cfg.file['paths']['mod'] + str(total_number_perturbations) + '_m.exe' 154 | builder.write(name_mod_file) 155 | 156 | except EXCEPTIONS as e: 157 | print("When parsing & building returned the following error:", str(e)) 158 | return None 159 | 160 | return name_mod_file 161 | 162 | 163 | def rec_mod_files(input_bytes, actions, chosen_actions, inject_perturbation): 164 | """ 165 | Recursive function to inject perturbations to input malware sample 166 | 167 | Input: 168 | input_bytes: input malware in bytes 169 | actions: all possible perturbations 170 | chosen_actions: vector of perturbations to inject 171 | inject_perturbation: perturbation being injected on this iteration 172 | """ 173 | 174 | if inject_perturbation == -1: 175 | return build_bytes(input_bytes, len(chosen_actions)) 176 | else: 177 | try: 178 | manipulator = m.MalwareManipulator(input_bytes) 179 | next_action = actions[chosen_actions[inject_perturbation]] 180 | inject_action = manipulator.__getattribute__(next_action) 181 | mod_bytes = inject_action(input_bytes) 182 | 183 | except EXCEPTIONS as e: 184 | print('When injecting perturbation returned the error: ', e) 185 | return None 186 | 187 | return rec_mod_files(mod_bytes, actions, chosen_actions, inject_perturbation - 1) 188 | 189 | 190 | # CALCULATE DIFFERENCE BETWEEN TWO PEs 191 | 192 | 193 | def get_difference(sample1, sample2): 194 | """ 195 | Calculate the difference between two PE: 196 | 197 | Input: 198 | sample1: original sample S 199 | sample2: mutation S' 200 | """ 201 | 202 | s1_bytes = readfile(sample1) 203 | s2_bytes = readfile(sample2) 204 | try: 205 | # Use -n to compare only until smallest file ends to avoid EOF message 206 | 
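# Note (annotation): 'cmp -l' emits one line per differing byte (offset plus
# the two octal values), so len(stdout) below is a rough proxy for the
# byte-level distance between the original sample and its mutation.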
compare_samples = subprocess.Popen( 207 | ['cmp', '-l', '-n' + str(min(len(s1_bytes), len(s2_bytes))), sample1, sample2], 208 | stdout=subprocess.PIPE) 209 | out_compare_samples, err_compare_samples = compare_samples.communicate() 210 | 211 | except subprocess.CalledProcessError as e: 212 | raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) 213 | 214 | compare_samples.kill() 215 | return len(out_compare_samples) 216 | 217 | 218 | # API MANAGEMENT: VIRUS TOTAL & HYBRID ANALYSIS & METADEFENDER 219 | 220 | 221 | def check_API_key(api_key): 222 | """ 223 | Check whether an API key is given before using an external service 224 | """ 225 | if api_key == '': 226 | sys.exit('\nProvide an API key to use this service.\n') 227 | return 1 228 | 229 | 230 | def get_user_quotas_VT(): 231 | """ 232 | APIv3 implementation to get request quotas of user 233 | """ 234 | 235 | url = 'https://www.virustotal.com/api/v3/users' 236 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 237 | response = requests.get(url + '/{}'.format(VT_API_KEY), headers=headers) 238 | json_response = response.json() 239 | request_rate = json_response['data']['attributes']['quotas']['api_requests_hourly']['allowed'] 240 | return request_rate 241 | 242 | 243 | def send_VT(sample): 244 | """ 245 | APIv3 implementation to send a file for analysis using VirusTotal 246 | 247 | Input: 248 | sample: malware that will be labeled 249 | """ 250 | 251 | # Check API key given 252 | check_API_key(VT_API_KEY) 253 | 254 | url = 'https://www.virustotal.com/api/v3/files' 255 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 256 | files = {'file': (sample, open(sample, 'rb'))} 257 | response = requests.post(url, headers=headers, files=files) 258 | json_response = response.json() 259 | return json_response 260 | 261 | 262 | def get_report_VT(file_hash, rescan=False): 263 | """ 264 | APIv3 implementation to retrieve report from a file analyzed using VirusTotal 265 | 266 | Input: 267 | file_hash: sample of malware to retrieve 268 | rescan: boolean option to rescan file in case it is previously detected 269 | """ 270 | 271 | # Check API key given 272 | check_API_key(VT_API_KEY) 273 | 274 | requests_allowed_minute = get_user_quotas_VT() / 60 275 | url = 'https://www.virustotal.com/api/v3/files' 276 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 277 | querystring = {'limit': '10'} 278 | 279 | if rescan: 280 | response = requests.post(url + '/{}/analyse'.format(file_hash), headers=headers, params=querystring) 281 | return response.json() 282 | else: 283 | attempts = 0 284 | while attempts < requests_allowed_minute: 285 | 286 | response = requests.get(url + '/{}'.format(file_hash), headers=headers, params=querystring) 287 | 288 | if response.status_code == 404: 289 | time_to_sleep = (1 if 60 / requests_allowed_minute < 1 else 60 / requests_allowed_minute) 290 | print("Sample is not on VirusTotal. Waiting {} s..".format(time_to_sleep)) 291 | time.sleep(time_to_sleep) 292 | 293 | elif response.status_code != 200: 294 | print( 295 | '\nVirusTotal returned server error {} while requesting scan report. Probably API issues. Exiting ' 296 | 'application until solved.\n'.format( 297 | response.status_code)) 298 | sys.exit() 299 | 300 | else: 301 | json_response = response.json() 302 | return json_response 303 | 304 | attempts += 1 305 | 306 | sys.exit("VirusTotal processing is taking too long. 
Timing out.") 307 | 308 | 309 | def get_report_VT_ext(file_hash, json_dest_path): 310 | """ 311 | Detecting malware samples using VirusTotal APIv3 (remote) 312 | 313 | Input: 314 | sample_report: the number of VT detections to use as benchmark 315 | """ 316 | 317 | print('\nDetection for sample:', file_hash) 318 | 319 | try: 320 | # Get VirusTotal detections - Rescan: False 321 | report = get_report_VT(file_hash, False) 322 | report_stats = report['data']['attributes']['last_analysis_stats'] 323 | report_results = report['data']['attributes']['last_analysis_results'] 324 | 325 | # Check reported status of sample 326 | detected = report_stats['malicious'] 327 | undetected = report_stats['undetected'] 328 | total = detected + undetected 329 | print('\nDetected by {} out of {} engines. \n'.format(detected, total)) 330 | 331 | # Print only engines detecting new sample 332 | engines_detecting = {key: val for key, val in report_results.items() if val['category'] == 'malicious'} 333 | print(list(engines_detecting.keys())) 334 | 335 | # Label as malicious if most of engines do so 336 | detection = (1 if detected / total > 0.5 else 0) 337 | 338 | # Provide link to sample detections report 339 | # print('\n{}'.format(report['data']['links']['self'])) 340 | 341 | # Save json file 342 | with open(json_dest_path.format(file_hash), 'w') as json_file: 343 | json.dump(report, json_file) 344 | 345 | return detection 346 | 347 | except (requests.ConnectionError, requests.Timeout, requests.ConnectTimeout) as e: 348 | print('Connection issues or API requests threshold reached: {}'.format(e)) 349 | 350 | 351 | def send_MD(sample): 352 | """ 353 | APIv2 implementation to send a file for analysis using MetaDefender 354 | 355 | Input: 356 | sample: malware that will be labeled 357 | """ 358 | 359 | # Check API key given 360 | check_API_key(MD_API_KEY) 361 | 362 | headers = {'apikey': MD_API_KEY} 363 | files = {'file': (sample, open(sample, 'rb'))} 364 | response = requests.post('https://api.metadefender.com/v2/file', headers=headers, files=files) 365 | json_response = response.json() 366 | return json_response 367 | 368 | 369 | def get_report_MD(data_id): 370 | """ 371 | APIv2 implementation to retrieve report from a file analyzed using MetaDefender 372 | """ 373 | 374 | # Check API key given 375 | check_API_key(MD_API_KEY) 376 | 377 | headers = {'apikey': MD_API_KEY} 378 | response = requests.get('https://api.metadefender.com/v2/file/' + data_id, headers=headers) 379 | json_response = response.json() 380 | return json_response 381 | 382 | 383 | def send_HA(sample, environment_id): 384 | """ 385 | APIv2 implementation to send a file for analysis using Hybrid Analysis 386 | 387 | Input: 388 | sample: malware that will be labeled 389 | environment_id: OS used to run malware sample (ID = 120: Windows7 - 64 bits) 390 | """ 391 | 392 | # Check API key given 393 | check_API_key(HA_API_KEY) 394 | 395 | f = open(sample, 'rb') 396 | headers = {'User-agent': 'Falcon Sandbox', 'api-key': HA_API_KEY} 397 | data = {'environment_id': environment_id, 'no_share_third_party': True, 'allow_community_access': False} 398 | files = {'file': f} 399 | 400 | try: 401 | submitUrl = 'https://www.hybrid-analysis.com/api/v2/submit/file' 402 | res = requests.post(submitUrl, headers=headers, data=data, files=files) 403 | if res.status_code == 200 or res.status_code == 201: 404 | print("\nFile successfully submitted to analysis: {}".format(os.path.basename(sample))) 405 | f.close() 406 | return res.json() 407 | else: 408 | print("Error code: 
{}, returned when uploading: {}".format(res.status_code, f.name)) 409 | return res.status_code 410 | 411 | except requests.exceptions.HTTPError as err: 412 | print("HTTP error encountered: {}".format(err)) 413 | return None 414 | 415 | 416 | def get_report_HA(file_hash): 417 | """ 418 | APIv2 implementation to retrieve report from a file analyzed using Hybrid Analysis 419 | """ 420 | 421 | # Check API key given 422 | check_API_key(HA_API_KEY) 423 | 424 | # Adjusted the API from params to headers to send the api-key 425 | headers = {'User-agent': 'Falcon Sandbox', 'api-key': HA_API_KEY} 426 | # EnvironmentID = 120 needs to be URL-encoded as '%3A120' (i.e., ':120') 427 | res = requests.get('https://www.hybrid-analysis.com/api/v2/report/' + file_hash + '%3A120' + '/summary', 428 | headers=headers) 429 | json_res = res.json() 430 | return json_res 431 | 432 | 433 | # API MANAGEMENT: LOCAL SANDBOX (CUCKOO) 434 | 435 | 436 | def send_local_sandbox(sample): 437 | """ 438 | API implementation to send a file for analysis using Cuckoo sandbox (local) 439 | 440 | Input: 441 | sample: malware that will be labeled 442 | """ 443 | 444 | submitUrl = "http://localhost:8090/tasks/create/file" 445 | data = {'timeout': '30'} 446 | with open(sample, "rb") as sample: 447 | files = {"file": ("new_mutation", sample)} 448 | r = requests.post(submitUrl, data=data, files=files) 449 | 450 | try: 451 | if r.status_code == 200: 452 | # print("\nFile successfully submitted to analysis: {}".format(os.path.basename(sample))) 453 | sample.close() 454 | return r.json() 455 | else: 456 | print("Error code: {}, returned when submitting.".format(r.status_code)) 457 | return r.status_code 458 | 459 | except requests.exceptions.HTTPError as err: 460 | print("HTTP error encountered: {}".format(err)) 461 | return None 462 | 463 | 464 | def get_report_local_sandbox(id_report, option): 465 | """ 466 | API implementation to retrieve report from a file analyzed using the Cuckoo sandbox 467 | """ 468 | 469 | # Options: view = short report | report = extensive report 470 | if option == 'view': 471 | r = requests.get('http://localhost:8090/tasks/view/' + str(id_report)) 472 | else: 473 | r = requests.get('http://localhost:8090/tasks/report/' + str(id_report)) 474 | return r.json() 475 | 476 | 477 | # DATABASE.CSV CREATION & UPDATE 478 | 479 | 480 | def collect_info_CSV(sample, sample_report, number_perturbations, chosen_actions, mod_sample_hash, hash_sample): 481 | """ 482 | Collect info on dict and prepare to save on CSV 483 | 484 | Input: 485 | sample: name of malware mutation 486 | sample_report: detection rate of mutation (positive/total detections) 487 | number_perturbations: number of perturbations injected 488 | chosen_actions: vector with perturbations injected to create malware mutation 489 | mod_sample_hash: SHA256 value of malware mutation 490 | hash_sample: SHA256 value of original malware provided as input 491 | """ 492 | 493 | CSV = {'Original_File': sample, 'OF_Detections': str(sample_report['positives']) + '/' + str( 494 | sample_report['total']), 'Perturbations': str(number_perturbations), 495 | 'Perturbations_Injected': chosen_actions[:number_perturbations], 'Mod_File_Hash': mod_sample_hash, 496 | 'Original_File_Hash': hash_sample} 497 | return CSV 498 | 499 | 500 | def write_dict_CSV(csv_file, CSV, fields): 501 | """ 502 | Save a dict into a CSV file 503 | 504 | Input: 505 | csv_file: CSV file to create 506 | CSV: dict with values to store 507 | fields: pre-defined column names 508 | """ 509 | 510 | try: 511 | if not os.path.isfile(csv_file): 512 | with open(csv_file,
'w') as fi: 513 | writer = csv.DictWriter(fi, fieldnames=fields) 514 | writer.writeheader() 515 | writer.writerow(CSV) 516 | else: 517 | with open(csv_file, 'a') as fi: 518 | writer = csv.DictWriter(fi, fieldnames=fields, extrasaction='ignore') 519 | writer.writerow(CSV) 520 | 521 | except IOError as err: 522 | print("Exception: {}".format(err)) 523 | 524 | 525 | # TABLE CREATION FOR COMPARISON BETWEEN ARMED & AIMED 526 | 527 | 528 | def time_to_seconds(data, new_df_cols=None, original_csv_cols=None): 529 | """ 530 | Convert time in data.csv from hh:mm:ss to s 531 | 532 | Input: 533 | data: input CSV file 534 | new_df_cols: columns for new dataframe used for format conversion (optional) 535 | original_csv_cols: pre-defined columns in original input CSV (optional) 536 | """ 537 | 538 | if new_df_cols is None: 539 | new_df_cols = ['Perturbations', 'Files M1', 'Time M1', 'Time M2'] 540 | if original_csv_cols is None: 541 | original_csv_cols = ['Sample', 'Perturbations', 'Module 1', 'Time M1', 'Files M1', 'Corr M1', 'Module 2', 542 | 'Time M2', 'Files M2', 'Corr M2', 'Total Time'] 543 | time_seconds = pd.DataFrame(columns=new_df_cols) 544 | csv_panda = pd.read_csv(data, names=original_csv_cols) 545 | for i in range(1, len(csv_panda)): 546 | x = time.strptime(csv_panda['Time M1'][i].split(',')[0], '%H:%M:%S') 547 | y = time.strptime(csv_panda['Time M2'][i].split(',')[0], '%H:%M:%S') 548 | time_seconds.loc[len(time_seconds)] = [csv_panda['Perturbations'][i], csv_panda['Files M1'][i], 549 | timedelta(hours=x.tm_hour, minutes=x.tm_min, 550 | seconds=x.tm_sec).total_seconds(), 551 | timedelta(hours=y.tm_hour, minutes=y.tm_min, 552 | seconds=y.tm_sec).total_seconds()] 553 | 554 | return time_seconds 555 | 556 | 557 | def sum_times(data, col_time): 558 | """ 559 | Calculate from data the total time elapsed processing ARMED & AIMED 560 | 561 | Input: 562 | data: pd.Dataframe with time information 563 | col_time: column with time values (e.g., col_time='Time M1') 564 | """ 565 | 566 | sum_time_elapsed = {} 567 | for i in range(1, len(data)): 568 | if (data['Files M1'][i]) in sum_time_elapsed.keys(): 569 | ext_sum = sum_time_elapsed[(data['Files M1'][i])] + data[col_time][i] 570 | sum_time_elapsed.update({(data['Files M1'][i]): ext_sum}) 571 | else: 572 | sum_time_elapsed[(data['Files M1'][i])] = data[col_time][i] # Seed with col_time, not a fixed column 573 | 574 | return sum_time_elapsed 575 | 576 | 577 | def average_times(number_files_grouped_AXMED, sum_times_files_ARMED, sum_times_files_AIMED, csv_file=None, save=False): 578 | """ 579 | Create dict with number of mutations generated and average processing time 580 | for ARMED (column 1) and AIMED (column 2) 581 | 582 | Input: 583 | number_files_grouped_AXMED: group with sum of all instances of times with same number of files created 584 | sum_times_files_ARMED: sum of all instances of times with same number of files created for ARMED 585 | sum_times_files_AIMED: sum of all instances of times with same number of files created for AIMED 586 | csv_file: input csv file (optional) 587 | save: boolean value to confirm whether to save results (default: False) 588 | """ 589 | 590 | average_times_ARMED = {} 591 | average_times_AIMED = {} 592 | for k, v in sum_times_files_ARMED.items(): 593 | average_times_ARMED.update({k: round(sum_times_files_ARMED[k] / number_files_grouped_AXMED[k])}) 594 | average_times_AIMED.update({k: round(sum_times_files_AIMED[k] / number_files_grouped_AXMED[k])}) 595 | 596 | # Convert all items, keys (strings) and values (pd.Dataframe) to int 597 | average_times_ARMED =
{int(k): int(v) for k, v in average_times_ARMED.items()} 598 | average_times_AIMED = {int(k): int(v) for k, v in average_times_AIMED.items()} 599 | 600 | list_avg_times_ARMED = sorted(average_times_ARMED.items()) 601 | list_avg_times_AIMED = sorted(average_times_AIMED.items()) 602 | 603 | if save: 604 | with open('support_armed_times.csv', 'a') as f: 605 | writer = csv.writer(f) 606 | for row_i in list_avg_times_ARMED: 607 | writer.writerow(row_i) 608 | f.close() 609 | 610 | # Remove existing file to avoid adding duplicated data 611 | if os.path.exists(csv_file): 612 | os.remove(csv_file) 613 | 614 | i = 0 615 | with open('support_armed_times.csv', 'r') as fin: 616 | with open(csv_file, 'a') as file_out: 617 | writer = csv.writer(file_out) 618 | for row in csv.reader(fin): 619 | writer.writerow(row + [list_avg_times_AIMED[i][1]]) 620 | i += 1 621 | fin.close() 622 | file_out.close() 623 | 624 | # armed_times.csv is used as support to create csv_file with ARMED & AIMED times 625 | os.remove('support_armed_times.csv') 626 | 627 | return average_times_ARMED, average_times_AIMED 628 | 629 | 630 | def comparing_AXMED(): 631 | """ 632 | Create a CSV to be used directly in LaTeX with comparison between 633 | processing times of ARMED & AIMED 634 | """ 635 | 636 | # Prepare data to compare processing times between ARMED & AIMED 637 | AXMED_seconds = time_to_seconds('db/compare.csv') 638 | 639 | # Sum all instances of times with same number of files created 640 | sum_times_files_ARMED = sum_times(AXMED_seconds, 'Time M1') 641 | sum_times_files_AIMED = sum_times(AXMED_seconds, 'Time M2') 642 | 643 | # Group all lines with the same value of files / mutations generated 644 | number_files_grouped_AXMED = AXMED_seconds.groupby('Files M1').size() 645 | 646 | # Retrieve a csv file with 3 columns: 1) files generated 2) times ARMED and 3) times AIMED 647 | average_times(number_files_grouped_AXMED, sum_times_files_ARMED, sum_times_files_AIMED, 648 | csv_file='db/compare_armed_aimed.csv', save=True) 649 | 650 | 651 | # GET MALWARE SCORE USING PRE-TRAINED MODELS 652 | 653 | def load_av(filename): 654 | """ 655 | Load pre-saved model (lgb or pickle). 656 | 657 | Input: 658 | filename: model with .pkl extension 659 | """ 660 | # Convert to joblib (.pkl) if lgb model (.txt) in the input 661 | if filename.endswith(".txt"): 662 | bst = lightgbm.Booster(model_file=filename) 663 | new_filename = filename[:-4] + ".pkl" 664 | joblib.dump(bst, new_filename) 665 | loaded_model = joblib.load(new_filename) 666 | else: 667 | loaded_model = joblib.load(filename) 668 | return loaded_model 669 | 670 | 671 | def get_score_local(sample_bytes, model, top_features_path=''): 672 | """ 673 | Extract features from malware and get score using pre-saved model 674 | Ver.2: PEFeatureExtractor from EMBER dataset with 2381 features 675 | 676 | Input: 677 | sample_bytes: malware example 678 | model: ML-based model (i.e., LightGBM) 679 | top_features_path: path to NPZ with index of top features (Optional) 680 | """ 681 | 682 | # Handle LightGBM exception if different version of features used during training & testing (v1=2351 & v2=2381) 683 | if model.num_feature() == 2351: 684 | feature_extractor = PEFeatureExtractor(feature_version=1) 685 | elif model.num_feature() == 2381: 686 | feature_extractor = PEFeatureExtractor(feature_version=2) 687 | else: 688 | sys.exit('Number of features known are v1:2351 and v2:2381. 
Features detected: {}'.format(model.num_feature())) 689 | 690 | # Extract features of adversarial example 691 | features = feature_extractor.feature_vector(sample_bytes) 692 | 693 | # Optionally: Get score using reduced number of features (based on top20% of highest modified features of Logit model) 694 | if len(top_features_path) > 0: 695 | top_features = np.load(top_features_path) 696 | features = features[top_features['arr_0']] 697 | 698 | # Get malicious score from a single malware example 699 | score = model.predict([features])[0] 700 | 701 | return score 702 | 703 | 704 | # UAP related modules: Convert problem-space to feature-space dataset, exploration set, etc. 705 | 706 | 707 | def save_features_malware(csv_path, features_path, pert_vector): 708 | """ 709 | Saving features from adversarial examples (=evasion) of problem-space malware 710 | 711 | Input: 712 | csv_path: path to the CSV file 713 | features_path: path to the extracted features from files 714 | pert_vector: perturbation vector injected 715 | """ 716 | 717 | feature_extractor = PEFeatureExtractor() 718 | orig_features = [] 719 | mod_features = [] 720 | 721 | with open(csv_path + 'evasion.csv') as csv_file: 722 | 723 | dict_read = csv.DictReader(csv_file) 724 | for row in dict_read: 725 | 726 | # Ignoring malware with LIEF errors 727 | if row['Original_File'][21:25] == 'LIEF': 728 | continue 729 | 730 | # print(row['Original_File'], row['Manipulated_File']) 731 | 732 | try: 733 | orig_bin_bytes = readfile(row['Original_File']) 734 | mod_bin_bytes = readfile(row['Manipulated_File']) 735 | except OSError as e: 736 | print(e) 737 | 738 | orig_current_features = np.array(feature_extractor.feature_vector(orig_bin_bytes), dtype=np.float32) 739 | mod_current_features = np.array(feature_extractor.feature_vector(mod_bin_bytes), dtype=np.float32) 740 | 741 | orig_features.append(orig_current_features) 742 | mod_features.append(mod_current_features) 743 | 744 | orig_features = np.array(orig_features) 745 | mod_features = np.array(mod_features) 746 | 747 | np.savez_compressed(features_path + 'orig_files_uap_compress'.format(pert_vector), features=orig_features) 748 | np.savez_compressed(features_path + 'adv_examples_uap_compress'.format(pert_vector), features=mod_features) 749 | 750 | orig_loaded = np.load(features_path + 'orig_files_uap_compress.npz'.format(pert_vector)) 751 | mod_loaded = np.load(features_path + 'adv_examples_uap_compress.npz'.format(pert_vector)) 752 | 753 | # print('\nFeatures from original & modified problem-space malware saved.\n') 754 | # print('Orig:', orig_loaded['features'], len(orig_loaded['features'])) 755 | # print('Mod:', mod_loaded['features'], len(mod_loaded['features'])) 756 | 757 | assert np.array_equal(orig_features, orig_loaded['features']), 'Different sizes!' 758 | assert np.array_equal(mod_features, mod_loaded['features']), 'Different sizes!' 759 | 760 | 761 | # print('\nCompressed and original versions are equal in size: Checked') 762 | 763 | 764 | def copy_files_csv(csv_path='', dest_path=''): 765 | """ 766 | Copying specific examples by parsing (adversarial | detected | corrupt) 767 | CSV files. 
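For example (hypothetical paths), copy_files_csv(csv_path='db/evasion.csv', dest_path='samples/successful/') copies every Original_File listed in the CSV into dest_path, naming the copies 0, 1, 2, ... in row order.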
768 | 769 | Input: 770 | csv_path: path to the CSV file 771 | dest_path: destination path 772 | """ 773 | # Ensure directory exist 774 | os.makedirs(os.path.dirname(dest_path), exist_ok=True) 775 | 776 | file_counter = 0 777 | with open(csv_path) as csv_file: 778 | dict_read = csv.DictReader(csv_file) 779 | for row in dict_read: 780 | print(row['Original_File'], row['Manipulated_File']) 781 | shutil.copyfile(row['Original_File'], dest_path + str(file_counter)) 782 | file_counter += 1 783 | 784 | 785 | def create_exploration_validation_set(o_path='', e_path='', v_path='', threshold=0.9, model='data/lgbm_ember.pkl'): 786 | """ 787 | Create exploration & validation sets to use during greedy-process of UAP search. 788 | 789 | Input: 790 | o_path: origin path, pool of malware to sample from 791 | e_path: exploration path 792 | v_path: validation path 793 | """ 794 | exploration_files = 100 795 | validation_files = 1000 796 | 797 | # Ensure directories exist 798 | os.makedirs(os.path.dirname(e_path), exist_ok=True) 799 | os.makedirs(os.path.dirname(v_path), exist_ok=True) 800 | 801 | # Load LightGBM model 802 | av_model = load_av(model) 803 | 804 | number_samples = 0 805 | for sample in os.listdir(o_path): 806 | bin_bytes = readfile(o_path + sample) 807 | score = get_score_local(bin_bytes, av_model) 808 | 809 | # Collect {validation_files} *different* detected samples for UAP process 810 | if number_samples < validation_files: 811 | if score > threshold: 812 | number_samples += 1 813 | shutil.copyfile(o_path + sample, v_path + sample) 814 | print('Validation set: Malware {} detected & copied ({})'.format(number_samples, round(score, 2))) 815 | 816 | # Collect {exploration_files} *different* detected samples for UAP process 817 | elif validation_files <= number_samples < exploration_files + validation_files: 818 | if score > threshold and sample not in os.listdir(v_path): 819 | number_samples += 1 820 | shutil.copyfile(o_path + sample, e_path + sample) 821 | print('Exploration set: Malware {} detected & copied ({})'.format(number_samples-validation_files, round(score, 2))) 822 | else: 823 | sys.exit('\nExploration & Validation sets correctly created.') 824 | 825 | 826 | def clean_cuckoo_analyses_folder(path='.cuckoo/storage/analyses'): 827 | """ 828 | Delete analysis folder to spare storage. 
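Keeps the five most recent analysis folders and deletes everything older, via the sorted(...)[:-5] slice below.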
829 | 830 | Input: 831 | path: [default] path to Cuckoo 832 | """ 833 | 834 | path_analyses = os.path.join(os.path.expanduser('~'), path) 835 | for file in sorted(os.listdir(path_analyses))[:-5]: 836 | filename = os.path.join(path_analyses, file) 837 | shutil.rmtree(filename) 838 | return 839 | -------------------------------------------------------------------------------- /src/rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # AIMED-RL: Automatic Intelligent Malware modifications to Evade Detection - with Reinforcement Learning 4 | import numpy as np 5 | import random 6 | import distutils 7 | from enum import Enum 8 | import csv 9 | 10 | import chainer 11 | import chainerrl 12 | import chainer.links as L 13 | import chainer.functions as F 14 | from chainerrl.initializers import LeCunNormal 15 | from chainerrl.distribution import SoftmaxDistribution 16 | from chainerrl.action_value import DiscreteActionValue 17 | from chainerrl.optimizers import rmsprop_async 18 | from chainerrl import links 19 | from chainerrl.replay_buffers import EpisodicReplayBuffer 20 | 21 | import gym 22 | from gym import spaces 23 | 24 | import src.functions as f 25 | from collections import OrderedDict 26 | import data.pefeatures as pefeatures 27 | import os 28 | from datetime import datetime 29 | import data.manipulate as m 30 | from time import time 31 | 32 | ACTIONS = f.actions_vector(m.ACTION_TABLE.keys()) 33 | 34 | # Reward Weight Distributions: 35 | STANDARD_WEIGHTS = [0.33, 0.33, 0.33] 36 | INCREMENT_WEIGHTS = [0.5, 0.2, 0.3] # decrease importance of similarity 37 | 38 | 39 | # Class based on OpenAI Gym Environment and Gym Malware (https://github.com/endgameinc/gym-malware/) 40 | class MalwareEnv(gym.Env): 41 | metadata = {'render.modes': ['human']} 42 | 43 | def __init__(self, malware_list, detection_function, analysis_function): 44 | random.seed(PARAM_DICT["seed"]) 45 | self.malware_list = malware_list 46 | self.used_malware = [] 47 | self.actions = ACTIONS 48 | self.action_space = spaces.Discrete(len(ACTIONS)) 49 | self.actions_taken = [] 50 | 51 | self.max_turns = PARAM_DICT["max_turns"] 52 | 53 | self.strategy_reset = PARAM_DICT["strategy_reset"] # Reset actions after half 54 | self.strategy_inject = PARAM_DICT["strategy_inject"] # Inject actions randomly after half 55 | assert not (self.strategy_reset and self.strategy_inject) # Only one strategy possible at a time 56 | 57 | self.turns = 0 58 | 59 | # Reward weights: 60 | self.reward_weights = PARAM_DICT["weights"] 61 | assert np.sum(self.reward_weights) <= 1.0 # Sum of weights must not be bigger than 1 62 | self.detected_weight = self.reward_weights[0] 63 | self.similarity_weight = self.reward_weights[1] 64 | self.distance_weight = self.reward_weights[2] 65 | 66 | self.reward_punishment = PARAM_DICT["reward_punishment"] 67 | 68 | self.history = OrderedDict() 69 | 70 | # Functions: 71 | self.detector_function = detection_function 72 | self.functionality_function = lambda: (random.randint(0, 10), 0) # analysis_function 73 | self.similarity_function = f.get_difference 74 | 75 | # Malware Features: 76 | self.feature_extractor = pefeatures.PEFeatureExtractor() 77 | self.current_malware = None 78 | self.current_manipulation = None 79 | self.original_bytez = None 80 | self.feature_space = None 81 | 82 | def step(self, action_index): 83 | # Apply strategy - Reset actions after half: 84 | if self.strategy_reset and self.turns == self.max_turns / 2: 85 | self.actions_taken = [] 86 | 
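# Note (annotation): the reset strategy rolls the episode back to the pristine
# sample at half time; pointing current_manipulation at the original file makes
# the next similarity computation run against an unmodified baseline, and the
# "RESET" marker appended below keeps the action history readable in render().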
self.current_manipulation = self.current_malware 87 | self.history[self.current_malware].append("RESET") 88 | 89 | self.turns += 1 90 | 91 | # Manipulate Malware: 92 | try: 93 | bytez = self._take_action(action_index) 94 | # Update State 95 | self.feature_space = self.feature_extractor.feature_vector(bytez) 96 | except Exception as e: # PE Manipulation Errors 97 | print('Exception raised:', e) 98 | reward = 0 99 | episode_over = True 100 | return np.asarray(self.feature_space), reward, episode_over, \ 101 | {"detected": False, "detected_confidence": 0, "errored": True} 102 | 103 | reward, detected, detected_confidence = self._calculate_reward() 104 | 105 | max_turns_reached = False 106 | if self.turns >= self.max_turns: 107 | # reward = 0.0 Temporarily removed for last training 108 | max_turns_reached = True 109 | 110 | # print("Detected: ", detected) 111 | # print("Max_turns_reached:", max_turns_reached) 112 | 113 | episode_over = max_turns_reached or not detected 114 | # print("Episode over: ", episode_over) 115 | 116 | # Info may not be used by RL_Agent 117 | info = {"detected": detected, "detected_confidence": detected_confidence, "errored": False} 118 | return np.asarray(self.feature_space), reward, episode_over, info 119 | 120 | def get_random_action(self): 121 | action = random.randrange(0, len(self.actions)) 122 | print("Random action: " + self.actions[action]) 123 | return action 124 | 125 | def _take_action(self, action_index): 126 | action = self.actions[action_index] 127 | if self.strategy_inject and self.turns > self.max_turns / 2: 128 | random_index = random.randrange(start=0, stop=len(self.actions_taken) - 1, step=1) 129 | self.actions_taken[random_index] = action_index 130 | self.history[self.current_malware][random_index] = action 131 | else: 132 | self.actions_taken.append(action_index) 133 | self.history[self.current_malware].append(action) 134 | 135 | # Check to avoid 'list out of index' exceptions 136 | if self.turns <= self.max_turns / 2: 137 | self.current_manipulation = f.rec_mod_files(input_bytes=self.original_bytez, 138 | actions=self.actions, 139 | chosen_actions=self.actions_taken, 140 | inject_perturbation=self.turns-1) 141 | return f.readfile(self.current_manipulation) 142 | 143 | def reset(self): 144 | self.turns = 0 145 | self.actions_taken = [] 146 | self.current_malware = self._choose_next_malware() 147 | self.used_malware.append(self.current_malware) 148 | self.current_manipulation = self.current_malware # For similarity 149 | self.original_bytez = f.readfile(self.current_malware) 150 | self.feature_space = self.feature_extractor.feature_vector(self.original_bytez) # Observation space 151 | self.history[self.current_malware] = [] 152 | 153 | return np.asarray(self.feature_space) 154 | 155 | def reset_completely(self): 156 | # Also reset used malware 157 | self.history = OrderedDict() 158 | self.reset() 159 | self.used_malware = [] 160 | 161 | # Randomly choose next malware, which has not been used 162 | # If all available malware samples have already been used, list is resetting 163 | def _choose_next_malware(self): 164 | temp_list = [malware for malware in self.malware_list if self.used_malware.count(malware) == 0] 165 | if len(temp_list) == 0: 166 | temp_list = self.malware_list 167 | self.used_malware = [] 168 | return random.choice(temp_list) 169 | 170 | # Reward = Detection (0/1) + Similarity (functions.get_difference) + Distance (self.turns) 171 | def _calculate_reward(self): 172 | max_reward = PARAM_DICT["maximum_reward"] # AIMED-RL Paper: R_max = 
10 173 | 174 | # ** Detected: Is the malware detected by the model? ** 175 | # Not detected: good, detected: bad 176 | # Value: 0/10 177 | detected, confidence = self.detector_function(self.current_manipulation) 178 | # print("DETECTED FROM MODEL: ", detected, confidence) 179 | detected_reward = 0 180 | if not detected: 181 | detected_reward = max_reward 182 | 183 | # ** Similarity: How much does the manipulation differ from the original file? ** 184 | # Difference ~40%: Best (empirical estimate) 185 | # Value: 0-10 186 | 187 | # Gets back the difference between original and manipulated bytez, smaller value: more similar 188 | difference = self.similarity_function(self.current_manipulation, self.current_malware) 189 | original_length = len(self.original_bytez) 190 | similarity_reward = self._calculate_similarity_reward(difference, original_length) 191 | 192 | # ** Distance: How many actions have been taken? ** 193 | # More turns mean more actions and thus a more diverse action vector 194 | # Limit of 5 perturbations => Should be used completely 195 | # Value: #actions * (max_reward/max_perturbations) 196 | max_perturbations = PARAM_DICT["max_turns"] / 2 if PARAM_DICT["strategy_reset"] or PARAM_DICT["strategy_inject"] \ 197 | else PARAM_DICT["max_turns"] # Applying a strategy halves the amount of possible perturbations 198 | factor = max_reward / max_perturbations # 2 in AIMED-RL Paper 199 | distance_reward = len(self.actions_taken) * factor 200 | 201 | # Max value: max_reward 202 | reward = detected_reward * self.detected_weight + similarity_reward * self.similarity_weight \ 203 | + distance_reward * self.distance_weight 204 | 205 | if self.reward_punishment: 206 | punishment = self._calculate_doubled_perturbation_punishment() 207 | 208 | if detected: # Do not apply punishment if the last perturbation led to an adversarial sample 209 | reward *= punishment 210 | 211 | return reward, detected, confidence 212 | 213 | # Compare the actual similarity against the best value (40%) and compute the reward accordingly 214 | def _calculate_similarity_reward(self, difference, original): 215 | percent_sim = difference / original 216 | percent_best = 0.4 # Empirical estimate: a ~40% difference from the original file could be best for the adversarial file 217 | reward_sim = (1 - abs(percent_sim - percent_best)) * PARAM_DICT["maximum_reward"] # AIMED_RL Paper: R_max = 10 218 | return max(0, reward_sim) # No negative reward 219 | 220 | # Give punishment to doubled perturbations in actions_taken 221 | def _calculate_doubled_perturbation_punishment(self): 222 | no_punishment = 1 # no reduction 223 | punishment_doubled_once = 0.8 # 20% less reward 224 | punishment_doubled_twice = 0.6 # 40% less reward 225 | for action in self.actions_taken: 226 | if self.actions_taken.count(action) > 2: 227 | return punishment_doubled_twice 228 | if self.actions_taken.count(action) > 1: 229 | return punishment_doubled_once 230 | 231 | return no_punishment 232 | 233 | def render(self, mode='human', close=False): 234 | if self.current_malware is not None and self.history[self.current_malware] is not None: 235 | # print("Input object: " + str(self.current_malware)) 236 | if "RESET" in self.history[self.current_malware]: 237 | index_reset = self.history[self.current_malware].index("RESET") 238 | history_length = len(self.history[self.current_malware]) 239 | # print("Actions (before reset): " + str(self.history[self.current_malware][0:index_reset])) 240 | print("Actions (after reset): " + str(
self.history[self.current_malware][index_reset + 1:history_length])) 242 | else: 243 | print("Actions: " + str(self.history[self.current_malware])) 244 | else: 245 | print("Environment has not been reset.") 246 | 247 | 248 | # The DQNSettings Enum summarizes the possible enhancements to the DQN Algorithm 249 | class DQNSettings(Enum): 250 | REPLAY_BUFFER = 1 251 | PRIORITIZED_REPLAY_BUFFER = 2 252 | ADAM_OPTIMIZER = 3 253 | LINEAR_DECAY_EPSILON_GREEDY = 4 254 | BOLTZMANN_EXPLORATION = 5 255 | NOISY_NETS = 6 256 | ALGO_DQN = 7 257 | ALGO_ACER = 8 258 | ALGO_DISTDQN = 9 259 | 260 | 261 | # Reinforcement learning agent using chainer-rl library 262 | class RlAgent: 263 | def __init__(self, environment: MalwareEnv): 264 | self.env = environment 265 | self.obs_size = len(environment.feature_space) 266 | self.n_actions = environment.action_space.n 267 | 268 | if DQNSettings.ALGO_ACER.name in PARAM_DICT["agent"]: 269 | self.agent = self.create_acer_agent() 270 | else: 271 | self.agent = self.create_dqn_agent() 272 | 273 | # For Algorithm Implementation see: https://github.com/endgameinc/gym-malware/blob/master/train_agent_chainer.py 274 | # Rainbow: https://github.com/chainer/chainerrl/tree/master/examples/atari/reproduction/rainbow 275 | def create_dqn_agent(self): 276 | q_func = None 277 | if DQNSettings.ALGO_DQN.name in PARAM_DICT["agent"]: 278 | q_func = QFunction(self.obs_size, self.n_actions) 279 | elif DQNSettings.ALGO_DISTDQN.name in PARAM_DICT["agent"]: 280 | q_func = chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction( 281 | ndim_obs=self.obs_size, 282 | n_actions=self.n_actions, 283 | n_atoms=51, # See paper 284 | v_min=-10, # See paper 285 | v_max=10, # max reward 286 | n_hidden_layers=2, 287 | n_hidden_channels=64 288 | ) 289 | assert q_func is not None 290 | 291 | optimizer = None 292 | if DQNSettings.ADAM_OPTIMIZER.name in PARAM_DICT["optimizer"]: 293 | optimizer = chainer.optimizers.Adam(eps=PARAM_DICT["adam_epsilon"]) 294 | optimizer.setup(q_func) 295 | assert optimizer is not None 296 | 297 | explorer = None 298 | if DQNSettings.LINEAR_DECAY_EPSILON_GREEDY.name in PARAM_DICT["explorer"]: 299 | explorer = chainerrl.explorers. 
\ 300 | LinearDecayEpsilonGreedy(start_epsilon=1.0, 301 | end_epsilon=0.05, 302 | decay_steps=100, 303 | random_action_func=self.env.get_random_action) 304 | elif DQNSettings.BOLTZMANN_EXPLORATION.name in PARAM_DICT["explorer"]: 305 | explorer = chainerrl.explorers.Boltzmann(T=PARAM_DICT["boltzmann_temperature"]) 306 | elif DQNSettings.NOISY_NETS.name in PARAM_DICT["explorer"]: 307 | links.to_factorized_noisy(q_func, sigma_scale=0.5) # Sigma from chainerrl rainbow 308 | explorer = chainerrl.explorers.Greedy() # Turn off explorer (because of Noisy Nets) 309 | assert explorer is not None 310 | 311 | replay_buffer = None 312 | if DQNSettings.REPLAY_BUFFER.name in PARAM_DICT["replay_buffer"]: 313 | replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=PARAM_DICT["replay_buffer_capacity"]) 314 | elif DQNSettings.PRIORITIZED_REPLAY_BUFFER.name in PARAM_DICT["replay_buffer"]: 315 | betasteps = PARAM_DICT["max_turns"] * PARAM_DICT["episodes"] 316 | replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer( 317 | capacity=PARAM_DICT["replay_buffer_capacity"], 318 | alpha=0.6, 319 | beta0=0.4, 320 | betasteps=betasteps, # max_turns*episodes 321 | eps=0.01, 322 | normalize_by_max=True, 323 | error_min=0, 324 | error_max=1, 325 | num_steps=1) 326 | assert replay_buffer is not None 327 | 328 | phi = lambda obs: obs.astype(np.float32, copy=False) 329 | 330 | agent = None 331 | if DQNSettings.ALGO_DQN.name in PARAM_DICT["agent"]: 332 | agent = chainerrl.agents.DoubleDQN(q_function=q_func, 333 | optimizer=optimizer, 334 | replay_buffer=replay_buffer, 335 | explorer=explorer, 336 | gamma=PARAM_DICT["dqn_gamma"], 337 | replay_start_size=PARAM_DICT["replay_start_size"], 338 | update_interval=PARAM_DICT["update_interval"], 339 | target_update_interval=PARAM_DICT["target_update_interval"], 340 | phi=phi) 341 | elif DQNSettings.ALGO_DISTDQN.name in PARAM_DICT["agent"]: 342 | agent = chainerrl.agents.CategoricalDoubleDQN(q_function=q_func, 343 | optimizer=optimizer, 344 | replay_buffer=replay_buffer, 345 | gamma=PARAM_DICT["dqn_gamma"], 346 | explorer=explorer, 347 | minibatch_size=PARAM_DICT["minibatch_size"], 348 | replay_start_size=PARAM_DICT["replay_start_size"], 349 | target_update_interval=PARAM_DICT["target_update_interval"], 350 | update_interval=PARAM_DICT["update_interval"], 351 | batch_accumulator=PARAM_DICT["batch_accumulator"], 352 | phi=phi, 353 | ) 354 | assert agent is not None 355 | return agent 356 | 357 | # ACER agent was used in Gym Malware Environment. 
In this framework, however, it has not been thoroughly tested 358 | def create_acer_agent(self): 359 | model = chainerrl.agents.acer.ACERSeparateModel( 360 | pi=links.Sequence( 361 | L.Linear(self.obs_size, 1024, initialW=LeCunNormal(1e-3)), 362 | F.relu, 363 | L.Linear(1024, 512, initialW=LeCunNormal(1e-3)), 364 | F.relu, 365 | L.Linear(512, self.n_actions, initialW=LeCunNormal(1e-3)), 366 | SoftmaxDistribution), 367 | q=links.Sequence( 368 | L.Linear(self.obs_size, 1024, initialW=LeCunNormal(1e-3)), 369 | F.relu, 370 | L.Linear(1024, 512, initialW=LeCunNormal(1e-3)), 371 | F.relu, 372 | L.Linear(512, self.n_actions, initialW=LeCunNormal(1e-3)), 373 | DiscreteActionValue), 374 | ) 375 | 376 | opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99) 377 | opt.setup(model) 378 | opt.add_hook(chainer.optimizer.GradientClipping(40)) 379 | 380 | replay_buffer = EpisodicReplayBuffer(128) 381 | 382 | phi = lambda obs: obs.astype(np.float32, copy=False) 383 | 384 | agent = chainerrl.agents.ACER(model, opt, 385 | gamma=PARAM_DICT["dqn_gamma"], # reward discount factor 386 | t_max=32, # update the model after this many local steps 387 | replay_buffer=replay_buffer, 388 | n_times_replay=4, # number of times experience replay is repeated for each update 389 | replay_start_size=64, 390 | # don't start replay unless we have this many experiences in the buffer 391 | disable_online_update=True, # rely only on experience buffer 392 | use_trust_region=True, # enable trust region policy optimization 393 | trust_region_delta=0.1, # a parameter for TRPO 394 | truncation_threshold=5.0, # truncate large importance weights 395 | beta=1e-2, # entropy regularization parameter 396 | phi=phi) 397 | 398 | return agent 399 | 400 | def make_action(self, state, reward, train=True): 401 | if train: 402 | return self.agent.act_and_train(state, reward) 403 | 404 | return self.agent.act(state) 405 | 406 | def stop_episode_and_train(self, state, reward, done): 407 | self.agent.stop_episode_and_train(state, reward, done) 408 | 409 | def stop_episode(self): 410 | self.agent.stop_episode() 411 | 412 | def save_existing_agent(self, directory_agent): 413 | self.agent.save(directory_agent) 414 | distutils.dir_util.copy_tree(directory_agent, PARAM_DICT['save_agent']+'last/') 415 | 416 | def print_debug(self): 417 | print("RL AGENT: " + str(PARAM_DICT["name"])) 418 | print("Statistics: ", self.agent.get_statistics()) 419 | 420 | 421 | # See https://github.com/endgameinc/gym-malware/blob/master/train_agent_chainer.py 422 | class QFunction(chainer.Chain): 423 | def __init__(self, obs_size, n_actions): 424 | super(QFunction, self).__init__() 425 | n_hidden_channels = PARAM_DICT["dqn_hidden_size"] 426 | net = [] 427 | inp_dim = obs_size 428 | for i, n_hid in enumerate(n_hidden_channels): 429 | net += [('l{}'.format(i), L.Linear(inp_dim, n_hid))] 430 | net += [('norm{}'.format(i), L.BatchNormalization(n_hid))] 431 | net += [('_act{}'.format(i), F.relu)] 432 | inp_dim = n_hid 433 | 434 | net += [('output', L.Linear(inp_dim, n_actions))] 435 | 436 | with self.init_scope(): 437 | for n in net: 438 | if not n[0].startswith('_'): 439 | setattr(self, n[0], n[1]) 440 | 441 | self.forward = net 442 | 443 | def __call__(self, x, test=False): 444 | """ 445 | Args: 446 | x (ndarray or chainer.Variable): An observation 447 | test (bool): a flag indicating whether it is in test mode 448 | """ 449 | for n, f in self.forward: 450 | if not n.startswith('_'): 451 | x = getattr(self, n)(x) 452 | else: 453 | x = f(x) 454 | 455 | return
chainerrl.action_value.DiscreteActionValue(x) 456 | 457 | 458 | class Logger: 459 | """ 460 | Logger class to write data during training/evaluation to a csv file 461 | It also creates a training or evaluation report at the end that summarizes the results. 462 | The report also contains the current version of the PARAM_DICT to make the experiments reproducible 463 | """ 464 | def __init__(self, directory_to_save, evaluate): 465 | self.directory = directory_to_save 466 | self.adversarial_samples = [] 467 | self.values_of_one_file = [] 468 | if evaluate: 469 | self.data_file_name = PARAM_DICT["name"] + "_" + str(PARAM_DICT["threshold"]) + "_eval_data.csv" 470 | else: 471 | self.data_file_name = PARAM_DICT["name"] + "_train_data.csv" 472 | 473 | def reset_after_error(self): 474 | self.values_of_one_file = [] 475 | 476 | def log_turn_values(self, detection_value, reward, turn, episode, adversarial, actions_taken, malware): 477 | self.values_of_one_file.append((detection_value, reward, turn, episode, adversarial, actions_taken, malware)) 478 | if adversarial: 479 | self.adversarial_samples.append( 480 | (detection_value, reward, turn, episode, adversarial, actions_taken, malware)) 481 | 482 | def write_sample_values_to_file(self): 483 | # Initial create 484 | if not os.path.isfile(self.directory + self.data_file_name): 485 | data_report = open(self.directory + self.data_file_name, 'w') 486 | data_report.write("detection_value,reward,turn,episode,adversarial,actions_taken,malware") 487 | data_report.close() 488 | 489 | data_report = open(self.directory + self.data_file_name, 'a') 490 | for detection_value, reward, turn, episode, adversarial, actions_taken, malware in self.values_of_one_file: 491 | data_report.write("\n") 492 | adver_value = "1" if adversarial else "0" 493 | actions_string = str(actions_taken).replace("'", "").replace(",", ";") 494 | report_string = str(detection_value) + "," + str(reward) + "," + str(turn) + "," + str(episode) + "," + \ 495 | str(adver_value) + "," + actions_string + "," + str(malware).split("/")[-1] 496 | data_report.write(report_string) 497 | data_report.close() 498 | self.values_of_one_file = [] 499 | 500 | def save_agent_training_test_report(self, total_time, average_q, average_loss, agent_number_updates): 501 | type_dict = PARAM_DICT.copy() 502 | for key in type_dict: 503 | type_dict[key] = type(PARAM_DICT[key]) 504 | with open(self.directory + str(PARAM_DICT["name"]) + "_training_report.csv", 'w') as agent_report: 505 | w = csv.DictWriter(agent_report, PARAM_DICT.keys()) 506 | w.writeheader() 507 | w.writerow(PARAM_DICT) 508 | w.writerow(type_dict) 509 | 510 | pref_act_vector = self._calculate_most_often_used_action_vector() 511 | 512 | agent_report.write("\nAverage Q: " + str(average_q)) 513 | agent_report.write("\nAverage Loss: " + str(average_loss)) 514 | agent_report.write("\nNumber Updates Agent: " + str(agent_number_updates)) 515 | agent_report.write("\nPreferred Action Vector: " + str(pref_act_vector)) 516 | agent_report.write("\nTotal Time: " + str(total_time)) 517 | agent_report.write("\nNumber adversarial samples: " + str(len(self.adversarial_samples))) 518 | agent_report.close() 519 | 520 | # Add a copy of the reports to the last/ dir 521 | distutils.dir_util.copy_tree(self.directory, PARAM_DICT['save_report']+'training_reports/last/') 522 | 523 | def save_agent_evaluation_report(self, total_time, number_errored, average_q, average_loss, agent_number_updates): 524 | if not os.path.isdir(PARAM_DICT["save_report"] + "evaluating_reports/"): 525 | 
os.mkdir(PARAM_DICT["save_report"] + "evaluating_reports/") 526 | with open(str(PARAM_DICT["save_report"] + "evaluating_reports/" + str(PARAM_DICT["name"]) + "_" + str(PARAM_DICT["threshold"]) + "_evaluation_report.csv"), 'w') as agent_report: 527 | w = csv.DictWriter(agent_report, PARAM_DICT.keys()) 528 | w.writeheader() 529 | w.writerow(PARAM_DICT) 530 | 531 | pref_act_vector = self._calculate_most_often_used_action_vector() 532 | 533 | agent_report.write("\nAverage Q: " + str(average_q)) 534 | agent_report.write("\nAverage Loss: " + str(average_loss)) 535 | agent_report.write("\nNumber Updates Agent: " + str(agent_number_updates)) 536 | agent_report.write("\nPreferred Action Vector: " + str(pref_act_vector)) 537 | agent_report.write("\nTotal Time: " + str(total_time)) 538 | agent_report.write("\nNumber adversarial samples: " + str(len(self.adversarial_samples))) 539 | agent_report.write("\nNumber errored: " + str(number_errored)) 540 | agent_report.close() 541 | 542 | def _calculate_most_often_used_action_vector(self): 543 | actions = [act for (v, re, t, r, adv, act, ma) in self.adversarial_samples] 544 | if not actions: 545 | return [] 546 | return max(actions, key=actions.count) 547 | 548 | 549 | def _create_env(malware_path, malware_detection_function, malware_analysis_function): 550 | try: 551 | samples = os.listdir(malware_path) 552 | for i in range(len(samples)): 553 | samples[i] = malware_path + samples[i] 554 | except NotADirectoryError: 555 | samples = [malware_path] # Only test one sample 556 | 557 | env = MalwareEnv(malware_list=samples, 558 | detection_function=malware_detection_function, 559 | analysis_function=malware_analysis_function) 560 | return env 561 | 562 | 563 | # Creates the directories where the training and evaluation data, as well as the agent is stored 564 | def _make_saving_directories(): 565 | if not os.path.isdir(PARAM_DICT["save_report"] + "training_reports/"): 566 | os.mkdir(PARAM_DICT["save_report"] + "training_reports/") 567 | date_and_time_now = str(datetime.now()).split(".")[0].replace(" ", "-").replace(":", "-")[0:-3] # no seconds 568 | 569 | directory_logging = PARAM_DICT["save_report"] + "training_reports/" + date_and_time_now + "/" 570 | os.makedirs(directory_logging) 571 | directory_agent = PARAM_DICT["save_agent"] + date_and_time_now + "/" 572 | os.makedirs(directory_agent) 573 | return directory_logging, directory_agent 574 | 575 | 576 | # Creates a new agent and trains it with the current parameters from the PARAM_DICT 577 | def train_and_save_agent(malware_detection, malware_analysis): 578 | directory_logging, directory_agent = _make_saving_directories() 579 | malware_detection_function = lambda sample: malware_detection(sample=sample, 580 | model=PARAM_DICT["detection_model"], 581 | threshold=PARAM_DICT["threshold"]) 582 | # Environment: 583 | env = _create_env(malware_path=PARAM_DICT["malware_path"], 584 | malware_detection_function=malware_detection_function, 585 | malware_analysis_function=malware_analysis) 586 | state = env.reset() 587 | env.render() 588 | # Agent: 589 | agent = RlAgent(environment=env) 590 | # agent.print_debug() 591 | 592 | # Logger: 593 | logger = Logger(directory_to_save=directory_logging, 594 | evaluate=False) 595 | start_time = time() 596 | 597 | # TRAIN: 598 | episodes = PARAM_DICT["episodes"] 599 | episode = 1 600 | while episode <= episodes: 601 | print("\n### Training # Episode: {} of {} ###".format(episode, episodes)) 602 | current_turn = 0 603 | reward, episode_over, info, errored = 0, False, {}, False 604 | 
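# Note (annotation): one turn = one perturbation. make_action(train=True)
# calls the agent's act_and_train(), so the agent both picks the next action
# and learns from the previous transition; the episode ends once the mutation
# evades the detector or max_turns is reached, and episodes that hit LIEF
# manipulation errors are discarded and repeated (episode -= 1 below).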
while not episode_over: 605 | current_turn += 1 606 | action = agent.make_action(state, reward, train=True) 607 | print('\n## Turn: {} # Next action: {} ##'.format(current_turn, ACTIONS[action])) 608 | state, reward, episode_over, info = env.step(action) 609 | print("Reward in turn " + str(current_turn) + " : " + str(reward)) 610 | env.render() 611 | 612 | detected = info["detected"] 613 | detection_value = info["detected_confidence"] 614 | errored = info["errored"] # LIEF ERRORS DURING MANIPULATION (True/False) 615 | if not errored: 616 | logger.log_turn_values(detection_value=detection_value, 617 | reward=reward, 618 | turn=current_turn, 619 | episode=episode, 620 | adversarial=not detected, 621 | actions_taken=_map_action_indices_to_actions(env.actions_taken), 622 | malware=env.current_malware) 623 | elif errored: 624 | episode -= 1 # Ignore this episode 625 | print('Episode ignored due to manipulation errors. Restarting..') 626 | 627 | if not errored: 628 | agent.stop_episode_and_train(state, reward, episode_over) 629 | logger.write_sample_values_to_file() 630 | else: 631 | agent.stop_episode() # Do not train on errored malware 632 | logger.reset_after_error() 633 | 634 | state = env.reset() 635 | # print("Episode ended after " + str(current_turn) + " turns") 636 | # print("Reward after episode: " + str(reward) + "\n") 637 | 638 | episode += 1 639 | 640 | print("Training finished!") 641 | agent.save_existing_agent(directory_agent) 642 | avg_q = agent.agent.get_statistics()[0][1] 643 | avg_loss = agent.agent.get_statistics()[1][1] 644 | number_updates = agent.agent.get_statistics()[2][1] 645 | logger.save_agent_training_test_report(total_time=f.time_me(start_time), 646 | average_q=avg_q, 647 | average_loss=avg_loss, 648 | agent_number_updates=number_updates) 649 | return directory_logging 650 | 651 | 652 | # Loads the PARAM_DICT associated with an agent (in training report) 653 | def _load_agent_information(agent_information): 654 | global PARAM_DICT # Check 655 | # Load Dictionary: 656 | with open(agent_information, 'r') as file: 657 | r = csv.DictReader(file) 658 | loaded_dicts = [dict(d) for d in r] 659 | PARAM_DICT = loaded_dicts[0] 660 | type_dict = loaded_dicts[1] 661 | 662 | for key in PARAM_DICT: 663 | type_of_key_str = type_dict[key] 664 | if "int" in type_of_key_str: 665 | type_of_key = int 666 | elif "bool" in type_of_key_str: 667 | type_of_key = bool 668 | elif "float" in type_of_key_str: 669 | type_of_key = float 670 | elif "list" in type_of_key_str: 671 | type_of_key = list 672 | else: 673 | type_of_key = None 674 | if type_of_key is not None: 675 | if type_of_key == list: 676 | list_from_dict = str(PARAM_DICT[key]).replace("[", "").replace("]", "").split(",") 677 | map_to = int 678 | if "." 
652 | # Loads the PARAM_DICT associated with an agent (from its training report)
653 | def _load_agent_information(agent_information):
654 |     global PARAM_DICT  # Overwrite the module-level defaults with the loaded configuration
655 |     # Load Dictionary:
656 |     with open(agent_information, 'r') as file:
657 |         r = csv.DictReader(file)
658 |         loaded_dicts = [dict(d) for d in r]
659 |     PARAM_DICT = loaded_dicts[0]
660 |     type_dict = loaded_dicts[1]
661 | 
662 |     for key in PARAM_DICT:
663 |         type_of_key_str = type_dict[key]
664 |         if "int" in type_of_key_str:
665 |             type_of_key = int
666 |         elif "bool" in type_of_key_str:
667 |             type_of_key = bool
668 |         elif "float" in type_of_key_str:
669 |             type_of_key = float
670 |         elif "list" in type_of_key_str:
671 |             type_of_key = list
672 |         else:
673 |             type_of_key = None
674 |         if type_of_key is not None:
675 |             if type_of_key == list:
676 |                 list_from_dict = str(PARAM_DICT[key]).replace("[", "").replace("]", "").split(",")
677 |                 map_to = int
678 |                 if "." in list_from_dict[0]:
679 |                     map_to = float
680 |                 PARAM_DICT[key] = list(map(map_to, list_from_dict))
681 |             elif type_of_key == bool:
682 |                 PARAM_DICT[key] = True if "True" in PARAM_DICT[key] else False
683 |             else:
684 |                 PARAM_DICT[key] = type_of_key(PARAM_DICT[key])
685 | 
686 | 
687 | # Evaluates a given agent against the model from the malware_detection function.
688 | # Malware analysis can optionally be applied to successful adversarial examples after the evaluation.
689 | def load_and_evaluate_agent(directory_agent, agent_information, evaluation_set_directory,
690 |                             malware_detection, malware_analysis):
691 |     _load_agent_information(agent_information=agent_information)
692 | 
693 |     malware_detection_function = lambda sample: malware_detection(sample=sample,
694 |                                                                   model=PARAM_DICT["detection_model"],
695 |                                                                   threshold=PARAM_DICT["threshold"])
696 |     # Env
697 |     env = _create_env(malware_path=evaluation_set_directory,
698 |                       malware_detection_function=malware_detection_function,
699 |                       malware_analysis_function=malware_analysis)
700 |     state = env.reset()
701 | 
702 |     # Agent
703 |     agent = RlAgent(environment=env)
704 |     agent.agent.load(directory_agent)
705 |     # agent.print_debug()
706 | 
707 |     # Logger:
708 |     logger = Logger(directory_to_save=directory_agent, evaluate=True)
709 |     start_time = time()
710 | 
711 |     # MANIPULATE:
712 |     episodes = len(env.malware_list)
713 |     episode = 1
714 |     number_errored = 0
715 |     while episode <= episodes:
716 |         print("\n### Evaluation # Episode: {} of {} ###".format(episode, episodes))
717 |         current_turn = 0
718 |         reward, episode_over, info, errored = 0, False, {}, False
719 |         while not episode_over:
720 |             current_turn += 1
721 |             action = agent.make_action(state, reward, train=False)
722 |             print('\n## Turn: {} # Next action: {} ##'.format(current_turn, ACTIONS[action]))
723 |             state, reward, episode_over, info = env.step(action)
724 |             env.render()
725 |             # print("Action from agent: " + ACTIONS[action])
726 |             print("Reward in turn " + str(current_turn) + " : " + str(reward))
727 | 
728 |             detected = info["detected"]
729 |             detection_value = info["detected_confidence"]
730 |             errored = info["errored"]  # LIEF errors during manipulation (True/False)
731 |             if not errored:
732 |                 logger.log_turn_values(detection_value=detection_value,
733 |                                        reward=reward,
734 |                                        turn=current_turn,
735 |                                        episode=episode,
736 |                                        adversarial=not detected,
737 |                                        actions_taken=_map_action_indices_to_actions(env.actions_taken),
738 |                                        malware=env.current_malware)
739 | 
740 |             # Unlike training, the episode counter is not decremented; errored episodes are counted instead
741 |             elif errored:
742 |                 number_errored += 1
743 | 
744 |         agent.stop_episode()
745 |         if not errored:
746 |             logger.write_sample_values_to_file()
747 |         else:
748 |             logger.reset_after_error()
749 | 
750 |         state = env.reset()
751 |         # print("Episode ended after " + str(current_turn) + " turns")
752 |         # print("Reward after episode: " + str(reward) + "\n")
753 | 
754 |         episode += 1
755 | 
756 |     print("\nNumber errored: ", number_errored)
757 |     print("Evaluation finished!")
758 |     avg_q = agent.agent.get_statistics()[0][1]
759 |     avg_loss = agent.agent.get_statistics()[1][1]
760 |     number_updates = agent.agent.get_statistics()[2][1]
761 |     logger.save_agent_evaluation_report(total_time=f.time_me(start_time),
762 |                                         number_errored=number_errored,
763 |                                         average_q=avg_q,
764 |                                         average_loss=avg_loss,
765 |                                         agent_number_updates=number_updates)
766 | 
767 | 
768 | def _map_action_indices_to_actions(actions_taken):
769 |     actions = []
770 |     for index in actions_taken:
771 |         actions.append(ACTIONS[index])
772 |     return actions
773 | 
774 | 
775 | PARAM_DICT = {
776 |     "name": "AIMEDRL",
777 |     "seed": 1234,
778 |     "save_report": "db/rl/",
779 |     "save_agent": "samples/rl/agent/",
780 |     "malware_path": "samples/malware_set/",
781 |     "episodes": 1000,
782 |     "detection_model": "LightGBM",
783 |     "threshold": 0.9,
784 |     "max_turns": 10,
785 |     "strategy_reset": True,
786 |     "strategy_inject": False,
787 |     "maximum_reward": 10,
788 |     "weights": STANDARD_WEIGHTS,
789 |     "reward_punishment": True,
790 |     "agent": DQNSettings.ALGO_DISTDQN.name,
791 |     "optimizer": DQNSettings.ADAM_OPTIMIZER.name,
792 |     "adam_epsilon": 1e-2,
793 |     "dqn_gamma": 0.95,
794 |     "dqn_replay_start_size": 32,
795 |     "replay_buffer": DQNSettings.PRIORITIZED_REPLAY_BUFFER.name,
796 |     "replay_buffer_capacity": 1000,
797 |     "dqn_hidden_size": [64, 16],
798 |     "explorer": DQNSettings.NOISY_NETS.name,
799 |     "epsilon_greedy_start_epsilon": 1.0,
800 |     "epsilon_greedy_end_epsilon": 0.05,
801 |     "epsilon_greedy_decay_steps": 100,
802 |     "boltzmann_temperature": 1.0,
803 |     "replay_start_size": 32,
804 |     "minibatch_size": 32,
805 |     "batch_accumulator": "mean",
806 |     "update_interval": 1,
807 |     "target_update_interval": 100
808 | }
809 | 
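# A hedged sketch of how these settings would typically be wired into ChainerRL
# objects; the exact construction lives in RlAgent, defined earlier in this
# module, and `q_function` and `explorer` below are placeholders:
#
#   import chainer
#   import chainerrl
#
#   optimizer = chainer.optimizers.Adam(eps=PARAM_DICT["adam_epsilon"])
#   replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(
#       capacity=PARAM_DICT["replay_buffer_capacity"])
#   agent = chainerrl.agents.DQN(q_function, optimizer, replay_buffer,
#                                gamma=PARAM_DICT["dqn_gamma"],
#                                explorer=explorer,
#                                replay_start_size=PARAM_DICT["replay_start_size"],
#                                minibatch_size=PARAM_DICT["minibatch_size"],
#                                update_interval=PARAM_DICT["update_interval"],
#                                target_update_interval=PARAM_DICT["target_update_interval"],
#                                batch_accumulator=PARAM_DICT["batch_accumulator"])
#
# The epsilon_greedy_* and boltzmann_temperature entries presumably configure the
# alternative explorers and are ignored when "explorer" selects NOISY_NETS.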
--------------------------------------------------------------------------------