├── db ├── .gitkeep └── rl │ └── .gitkeep ├── tests ├── __init__.py └── test_main.py ├── samples ├── mod │ └── .gitkeep ├── npz │ └── .gitkeep ├── uap │ └── .gitkeep ├── malware_set │ └── .gitkeep ├── successful │ ├── .gitkeep │ └── detected │ │ └── .gitkeep ├── unsuccessful │ └── .gitkeep └── rl │ └── evaluation_set │ └── .gitkeep ├── data ├── lgbm_ember.pkl ├── lgbm_sorel.pkl ├── gradient_boosting.pkl ├── section_names.txt ├── manipulate.py └── pefeatures.py ├── codecov.yml ├── src ├── config.py ├── setup.py ├── config.ini ├── plot.py ├── gp.py ├── defense.py ├── functions.py └── rl.py ├── requirements.txt ├── docs ├── Makefile ├── make.bat ├── conf.py └── index.rst ├── .github └── workflows │ └── main.yml ├── Install.md ├── main.py ├── README.md └── License /db/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /db/rl/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/mod/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/npz/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/uap/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/malware_set/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/successful/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/unsuccessful/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/rl/evaluation_set/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/successful/detected/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/lgbm_ember.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/lgbm_ember.pkl -------------------------------------------------------------------------------- /data/lgbm_sorel.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/lgbm_sorel.pkl -------------------------------------------------------------------------------- /data/gradient_boosting.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zRapha/FAME/HEAD/data/gradient_boosting.pkl
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 |   require_ci_to_pass: true
3 | 
4 | coverage:
5 |   precision: 2
6 |   round: down
7 |   range: "70...100"
8 | 
9 | parsers:
10 |   gcov:
11 |     branch_detection:
12 |       conditional: yes
13 |       loop: yes
14 |       method: no
15 |       macro: no
16 | 
17 | comment:
18 |   layout: "reach,diff,flags,files,footer"
19 |   behavior: default
20 |   require_changes: false
21 | 
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import configparser
3 | 
4 | 
5 | def load_config(c):
6 |     """
7 |     Load configuration data.
8 |     """
9 | 
10 |     config = ''  # returned as-is if the file cannot be read
11 |     try:
12 |         path = os.path.dirname(os.path.realpath(__file__))
13 |         f = os.path.join(path, c)
14 |         config = configparser.ConfigParser()
15 |         config.read(f)
16 |     except Exception as e:
17 |         print("Error: {}".format(e))
18 |     return config
19 | 
20 | 
21 | file = load_config('config.ini')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements file to install all packages needed for FAME:
2 | # $ python3.7 -m venv fame-env && source fame-env/bin/activate && pip install -r requirements.txt
3 | 
4 | numpy==1.19.0
5 | pandas==0.25.0
6 | requests==2.28.2
7 | scikit-learn==0.21.2
8 | scipy==1.5.1
9 | lief==0.10.1
10 | lightgbm==2.3.1
11 | joblib==1.2.0
12 | chainer==7.8.0
13 | chainerrl==0.8.0
14 | pytest==6.2.5
15 | coverage==6.0
16 | tqdm~=4.62.3
17 | sphinx==4.2.0
18 | 
19 | gym~=0.19.0
20 | setuptools~=57.0.0
21 | 
--------------------------------------------------------------------------------
/data/section_names.txt:
--------------------------------------------------------------------------------
1 | .text
2 | .rsrc
3 | .reloc
4 | .data
5 | .rdata
6 | .idata
7 | .tls
8 | .brdata
9 | .bss
10 | .pdata
11 | .xdata
12 | DATA
13 | CODE
14 | BSS
15 | rdata
16 | .rmnet
17 | .CRT
18 | .edata
19 | .extrel
20 | .sdata
21 | .code
22 | .vmp0
23 | .itext
24 | .data2
25 | .data1
26 | .vmp1
27 | .adata
28 | .gfids
29 | .data3
30 | INIT
31 | .extjmp
32 | .didat
33 | .didata
34 | PAGE
35 | .orpc
36 | vryeypb
37 | camztlf
38 | tkjdelw
39 | dgbwqbp
40 | odyqxub
41 | .tsuarch
42 | .tsustub
43 | .textbss
44 | .sxdata
45 | .zrdata
46 | qxejodg
47 | .data-co
48 | .text-co
49 | gumrkvc
50 | rqvmxkb
51 | kakxcjb
52 | .cdata
53 | ExeS
54 | .rrdata
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
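# For example, from the docs/ directory "make html" renders the HTML docs, and
# "make html O=-W" forwards -W to sphinx-build so that warnings become errors.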
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://www.sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/src/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | from setuptools import setup
3 | 
4 | def read_file(filename):
5 |     with io.open(filename, mode='r', encoding='utf-8') as fd:
6 |         return fd.read()
7 | 
8 | setup(
9 |     name='FAMEwork',
10 |     version='0.1.5',
11 |     use_scm_version=False,
12 |     setup_requires=['setuptools_scm'],
13 |     include_package_data=True,
14 |     packages=['.'],
15 |     install_requires=[
16 |         'numpy==1.19.0',
17 |         'pandas==0.25.0',
18 |         'requests==2.28.2',
19 |         'scikit-learn==0.21.2',
20 |         'scipy==1.5.1',
21 |         'lief==0.10.1',
22 |         'lightgbm==2.3.1',
23 |         'joblib==1.2.0',
24 |         'chainer==7.8.0',
25 |         'chainerrl==0.8.0',
26 |         'pytest==6.2.5',
27 |         'coverage==6.0',
28 |         'tqdm~=4.62.3',
29 |         'sphinx==4.2.0',
30 |         'gym~=0.19.0',
31 |         'setuptools~=57.0.0'],
32 |     url='https://github.com/zRapha/FAME',
33 |     license='MPL-2.0',
34 |     author='Raphael Labaca Castro',
35 |     author_email='mail@rapha.ai',
36 |     description='Framework for Adversarial Malware Evaluation',
37 |     long_description=read_file('PyPI.md'),
38 |     long_description_content_type='text/markdown',
39 |     platforms=['Fedora 30', 'Ubuntu 16'],
40 |     entry_points={'console_scripts': ['fame = main:main']}
41 | )
42 | 
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: FAME
2 | 
3 | on:
4 |   push:
5 |     branches: [ master ]
6 |   pull_request:
7 |     branches: [ master ]
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-22.04
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - name: Set up Python 3.7.15
17 |       uses: actions/setup-python@v2
18 |       with:
19 |         python-version: 3.7.15
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install pip==23.0.1
23 |         pip install flake8 pytest
24 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
25 |     - name: Lint with flake8
26 |       run: |
27 |         # stop the build if there are Python syntax errors or undefined names
28 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
29 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 |         flake8 .
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pytest 34 | - name: Generate coverage report 35 | run: | 36 | coverage run -m pytest 37 | coverage xml 38 | - name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v2 40 | with: 41 | fail_ci_if_error: true 42 | token: 9e4c2ef9-14f0-466b-b8b2-ba3eefddb8e4 43 | -------------------------------------------------------------------------------- /src/config.ini: -------------------------------------------------------------------------------- 1 | [armed] 2 | rounds=100 3 | perturbations=5 4 | advFilesExpected=100 5 | model=EMBER 6 | integrityCheck=False 7 | 8 | [aimed] 9 | rounds=100 10 | perturbations=5 11 | advFilesExpected=1 12 | sizePopulation=4 13 | model=EMBER 14 | integrityCheck=False 15 | searchUAP=False 16 | 17 | [aimedrl] 18 | perturbations=5 19 | model=EMBER 20 | train=True 21 | evaluate=True 22 | 23 | [gameup] 24 | perturbations=10 25 | model=EMBER 26 | integrityCheck=False 27 | 28 | [defense] 29 | perturbations=10 30 | model=EMBER 31 | 32 | [compare] 33 | rounds=100 34 | perturbations=5 35 | advFilesExpected=1 36 | model=EMBER 37 | 38 | [apiKeys] 39 | vt= 40 | ha= 41 | md= 42 | 43 | [paths] 44 | db=db/ 45 | npz=samples/npz/ 46 | mod=samples/mod/ 47 | fail=samples/unsuccessful/ 48 | evasion=samples/successful/ 49 | detected=samples/successful/detected/ 50 | malware_set=samples/malware_set/ 51 | 52 | exploration=samples/uap/greedy/EMBER/exploration_set/ 53 | validation=samples/uap/greedy/EMBER/validation_set/ 54 | 55 | rl=samples/rl/ 56 | report=db/rl/training_reports/last/ 57 | 58 | model_path = data/models/ 59 | vectorized_path = samples/ember/ 60 | 61 | [db] 62 | fields=['Original_File', 63 | 'OF_Detections', 64 | 'Manipulated_File', 65 | 'MF_Detections', 66 | 'Perturbations', 67 | 'Perturbations_Injected', 68 | 'Full_Detections_Report', 69 | 'Full_Analysis_Report', 70 | 'Mod_File_Hash', 71 | 'Original_File_Hash', 72 | 'Date_Reported'] 73 | 74 | [remote] 75 | useVT = False 76 | useHA = False 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'FAME' 21 | copyright = '2022, Raphael Labaca Castro' 22 | author = 'Raphael Labaca Castro' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | ] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 
34 | templates_path = ['_templates']
35 | 
36 | # List of patterns, relative to source directory, that match files and
37 | # directories to ignore when looking for source files.
38 | # This pattern also affects html_static_path and html_extra_path.
39 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
40 | 
41 | 
42 | # -- Options for HTML output -------------------------------------------------
43 | 
44 | # The theme to use for HTML and HTML Help pages. See the documentation for
45 | # a list of builtin themes.
46 | #
47 | html_theme = 'alabaster'
48 | 
49 | # Add any paths that contain custom static files (such as style sheets) here,
50 | # relative to this directory. They are copied after the builtin static files,
51 | # so a file named "default.css" will overwrite the builtin "default.css".
52 | html_static_path = ['_static']
53 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. FAMEwork documentation master file, created by
2 |    sphinx-quickstart on Thu Oct 7 14:19:09 2021.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | ====================================
7 | Welcome to FAME's documentation!
8 | ====================================
9 | 
10 | FAME was designed to understand how byte-level transformations could automatically be injected into Windows Portable Executable (PE) files and compromise ML-based malware classifiers. Moreover, it supports integrity verification to ensure that the new adversarial examples are valid. This work implements the action space proposed in the `OpenAI gym malware <https://github.com/endgameinc/gym-malware>`_ environment. It has been implemented in Fedora 30 and tested on Ubuntu 16 using Python3. Library versions are defined in the requirements.txt file.
11 | 
12 | The framework consists of four modules, namely ARMED, AIMED, AIMED-RL & GAME-UP.
13 | 
14 | GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
15 | 
16 | This work intends to understand how Universal Adversarial Perturbations (UAPs) can be useful to create efficient adversarial examples compared to input-specific attacks. Furthermore, it explores how real malware examples in the problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
17 | 
18 | AIMED-RL: Automatic Intelligent Modifications to Evade Detection (with Reinforcement Learning)
19 | 
20 | This work is focused on understanding how sensitive static malware classifiers are to adversarial examples. It uses different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into Windows portable executable malware without compromising its functionality and, thus, keeping the newly generated adversarial example valid.
21 | 
22 | ..
toctree:: 23 | :maxdepth: 2 24 | :caption: Contents: 25 | 26 | 27 | 28 | Indices and tables 29 | ================== 30 | 31 | * :ref:`genindex` 32 | * :ref:`modindex` 33 | * :ref:`search` 34 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import src.config as cfg 3 | 4 | 5 | class TestMethods(unittest.TestCase): 6 | def test_armed_config(self): 7 | assert cfg.file.getint('armed', 'perturbations') > 0 8 | assert cfg.file.getint('armed', 'advFilesExpected') > 0 9 | assert cfg.file.getint('armed', 'rounds') >= cfg.file.getint('armed', 'advFilesExpected') 10 | assert cfg.file['armed']['model'] == "EMBER" or cfg.file['armed']['model'] == "SOREL" 11 | assert cfg.file.getboolean('armed', 'integrityCheck') is True or \ 12 | cfg.file.getboolean('armed', 'integrityCheck') is False 13 | 14 | def test_aimed_config(self): 15 | assert cfg.file.getint('aimed', 'perturbations') > 0 16 | assert cfg.file.getint('aimed', 'advFilesExpected') > 0 17 | assert cfg.file.getint('aimed', 'sizePopulation') >= 2 18 | assert cfg.file['aimed']['model'] == "EMBER" or cfg.file['aimed']['model'] == "SOREL" 19 | assert cfg.file.getboolean('aimed', 'integrityCheck') is True or \ 20 | cfg.file.getboolean('aimed', 'integrityCheck') is False 21 | 22 | def test_aimedrl_config(self): 23 | assert cfg.file.getint('aimedrl', 'perturbations') > 0 24 | assert cfg.file['aimedrl']['model'] == "EMBER" or cfg.file['aimedrl']['model'] == "SOREL" 25 | assert cfg.file.getboolean('aimedrl', 'train') is True or \ 26 | cfg.file.getboolean('aimedrl', 'train') is False 27 | assert cfg.file.getboolean('aimedrl', 'evaluate') is True or \ 28 | cfg.file.getboolean('aimedrl', 'evaluate') is False 29 | 30 | def test_gameup_config(self): 31 | assert cfg.file.getint('gameup', 'perturbations') > 0 32 | assert cfg.file['gameup']['model'] == "EMBER" or cfg.file['gameup']['model'] == "SOREL" 33 | assert cfg.file.getboolean('gameup', 'integrityCheck') is True or \ 34 | cfg.file.getboolean('gameup', 'integrityCheck') is False 35 | 36 | def test_defense_config(self): 37 | assert cfg.file.getint('defense', 'perturbations') > 0 38 | assert cfg.file['defense']['model'] == "EMBER" or cfg.file['defense']['model'] == "SOREL" 39 | 40 | def test_compare_config(self): 41 | assert cfg.file.getint('compare', 'perturbations') > 0 42 | assert cfg.file.getint('compare', 'advFilesExpected') > 0 43 | assert cfg.file.getint('compare', 'rounds') >= cfg.file.getint('compare', 'advFilesExpected') 44 | assert cfg.file['compare']['model'] == "EMBER" or cfg.file['compare']['model'] == "SOREL" 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /Install.md: -------------------------------------------------------------------------------- 1 | # Installation instructions (dev mode) 2 | 3 | Clone the FAME repository: 4 | ``` 5 | git clone git@github.com:zRapha/FAME.git 6 | ``` 7 | Create a virtual environment & activate it: 8 | ``` 9 | python3.7 -m venv fame-env 10 | source fame-env/bin/activate 11 | ``` 12 | Update pip if needed (pip<=23.0.1): 13 | ``` 14 | python -m pip install pip==23.0.1 15 | ``` 16 | 17 | Install required packages: 18 | ``` 19 | pip install -r requirements.txt 20 | ``` 21 | ## Integrity test verification 22 | Per default the functionality stage is implemented using Cuckoo, an analysis environment that has an 
extensive [documentation](https://cuckoo.readthedocs.io/en/latest/introduction/what/). Cuckoo provides dynamic analysis results, which can be useful to understand the adversarial examples generated. A local beta-test implementation is also provided for further extension.
23 | 
24 | ## Malware classification
25 | Local classification models are implemented to perform detection using pre-trained malware classifiers, namely LightGBM models trained on the EMBER and SOREL datasets. For those interested in more classifiers, we provide the option of using aggregators via REST APIs in order to assess adversarial examples against a wider range of commercial engines.
26 | 
27 | ## Dataset
28 | There are several public repositories containing labeled malicious files to test the environment. Once the data is acquired, it should be placed under the `samples/malware_set/` folder.
29 | 
30 | ## Further environment isolation [optional]
31 | Even though the manipulations do not require running any file, the integrity verification stage does. Hence, it is recommended to use isolated sandboxes and simulated services. One option is to use _inetsim_.
32 | 
33 | Disable the network interface (replace `<interface>` with yours):
34 | ```
35 | sudo ifconfig <interface> down
36 | ```
37 | 
38 | Run inetsim (tested version 1.2.8):
39 | ```
40 | cd /etc/default/inetsim/
41 | sudo ./inetsim
42 | ```
43 | 
44 | Note that automatically retrieving the detection rate for a malware file from an online aggregator will no longer be functional unless adjusted manually.
45 | 
46 | ## How to run FAME
47 | 
48 | ### 1. Activate Cuckoo Python venv:
49 | ```
50 | source ~/cuckoo-env/bin/activate
51 | ```
52 | 
53 | > If integrity verification is implemented proceed with _2_, otherwise jump to _5_.
54 | 
55 | ### 2. Run Mongo DB for webserver:
56 | ```
57 | sudo service mongod start
58 | ```
59 | 
60 | ### 3. Run webserver [optional]:
61 | ```
62 | cd ~/.cuckoo/
63 | cuckoo web
64 | ```
65 | 
66 | ### 4. Run API & Cuckoo sandbox:
67 | ```
68 | cuckoo api
69 | cuckoo
70 | ```
71 | 
72 | ### 5. Adjust configuration and initial parameters:
73 | ```
74 | vi config.ini
75 | ```
76 | 
77 | ### 6. Run FAME:
78 | ```
79 | ./main.py aimed
80 | ```
81 | 
82 | ## Segmentation fault
83 | We have observed that injecting some combinations of perturbations into specific PE files can raise segmentation fault
84 | issues. Due to the nature of memory violations and the occurrence of this issue (in our experiments, less than 0.02% of
85 | the cases), we recommend either adjusting the transformations' sequence to a different combination or trying a new example.
86 | Sometimes not patching the original import table, i.e., setting `builder.patch_imports(False)`, may also help prevent this issue.
87 | A workaround is curating the dataset by identifying the offending PE file and excluding it from the process.
88 | 
89 | 
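For orientation, the LIEF calls behind that suggestion look roughly as follows (a minimal sketch modeled on `data/manipulate.py`; the input and output paths are placeholders, not files shipped with the repo):

```
import lief

binary = lief.PE.parse('samples/malware_set/example.exe')
builder = lief.PE.Builder(binary)
builder.build_imports(True)   # rebuild the import table in a new section
builder.patch_imports(False)  # leave the original import table untouched
builder.build()
builder.write('samples/mod/example_mod.exe')
```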
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Welcome to the Framework for Adversarial Malware Evaluation (FAME)
4 | 
5 | FAME was designed to understand how byte-level transformations could automatically be injected into Windows Portable
6 | Executable (PE) files and compromise ML-based malware classifiers. Moreover, it supports integrity verification to
7 | ensure that the new adversarial examples are valid. This work implements the action space proposed in the OpenAI gym
8 | malware environment. It has been implemented in Fedora 30 and tested on Ubuntu 16 using Python3. Library versions are
9 | defined in the requirements.txt file.
10 | 
11 | The following modules are available: ARMED, AIMED, AIMED-RL & GAME-UP
12 | 
13 | GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
14 | 
15 | This work intends to understand how Universal Adversarial Perturbations (UAPs) can be useful to create efficient
16 | adversarial examples compared to input-specific attacks. Furthermore, it explores how real malware examples in the
17 | problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant
18 | of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
19 | 
20 | AIMED-RL: Automatic Intelligent Modifications to Evade Detection (with Reinforcement Learning)
21 | 
22 | This work is focused on understanding how sensitive static malware classifiers are to adversarial examples. It uses
23 | different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into
24 | Windows portable executable malware without compromising its functionality and, thus, keeping the newly generated
25 | adversarial example valid.
26 | 
27 | """
28 | 
29 | import sys
30 | import time
31 | import src.config as cfg
32 | import src.functions as f
33 | import src.implementation as i
34 | 
35 | 
36 | def main(argv=None):  # avoid evaluating sys.argv at import time
37 |     option = (argv if argv is not None else sys.argv[1]).upper()
38 | 
39 |     # Time algorithm
40 |     start = time.time()
41 | 
42 |     # ARMED: Finding adversarial malware examples stochastically
43 |     if option == 'ARMED':
44 |         i.armed(number_perturbations=cfg.file.getint('armed', 'perturbations'),
45 |                 rounds=cfg.file.getint('armed', 'rounds'), files_expected=cfg.file.getint('armed', 'advFilesExpected'),
46 |                 model=cfg.file['armed']['model'])
47 | 
48 |     # ARMED II: Using Incremental Iterations of perturbations' sequence
49 |     elif option == 'ARMED-II':
50 |         i.armed2(number_perturbations=cfg.file.getint('armed', 'perturbations'),
51 |                  rounds=cfg.file.getint('armed', 'rounds'),
52 |                  files_expected=cfg.file.getint('armed', 'advFilesExpected'),
53 |                  model=cfg.file['armed']['model'])
54 | 
55 |     # AIMED: Finding adversarial examples with genetic programming
56 |     elif option == 'AIMED':
57 |         i.aimed(size_population=cfg.file.getint('aimed', 'sizePopulation'),
58 |                 number_perturbations=cfg.file.getint('aimed', 'perturbations'),
59 |                 model=cfg.file['aimed']['model'])
60 | 
61 |     # AIMED-RL: Finding adversarial examples with reinforcement learning
62 |     elif option == 'AIMED-RL':
63 |         i.aimed_rl(base_path=cfg.file['paths']['rl'],
64 |                    report_path=cfg.file['paths']['report'],
65 |                    train=cfg.file.getboolean('aimedrl', 'train'),
66 |                    evaluate=cfg.file.getboolean('aimedrl', 'evaluate'))
67 | 
68 |     # GAME-UP: Find universal perturbation sequences to generate adversarial examples
69 |     elif option == 'GAMEUP':
70 |         i.gameup(number_perturbations=cfg.file.getint('gameup', 'perturbations'), model=cfg.file['gameup']['model'],
71 |                  exploration_set=cfg.file['paths']['exploration'])
72 | 
73 |     # UAP-DEF: Use UAPs to increase resilience of models against universal attacks
74 |     elif option == 'DEFENSE':
75 |         i.defense(number_perturbations=cfg.file.getint('defense', 'perturbations'),
76 |                   model=cfg.file['defense']['model'])
77 | 
78 |     # COMPARE: Evaluate different algorithms (Example imp.: AIMED vs ARMED)
79 |     elif option == 'COMPARE':
80 |         i.comparing(number_perturbations=cfg.file.getint('compare', 'perturbations'),
81 |                     rounds=cfg.file.getint('compare', 'rounds'),
82 |                     files_expected=cfg.file.getint('compare', 'advFilesExpected'),
83 |                     model=cfg.file['compare']['model'])
84 | 
85 |     else:
86 |         exit('Option not found!')
87 | 
88 |     f.time_me(start)
89 | 
90 | 
91 | if __name__ == '__main__':
92 |     main()
93 | 
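# Example invocations (the module argument is upper-cased, so case does not matter):
#   ./main.py armed      -> stochastic search
#   ./main.py armed-ii   -> incremental perturbation sequences
#   ./main.py aimed      -> genetic programming
#   ./main.py aimed-rl   -> reinforcement learning
#   ./main.py gameup     -> universal perturbation search
#   ./main.py defense    -> UAP-based adversarial training
#   ./main.py compare    -> AIMED vs. ARMED comparison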
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FAME
2 | 
3 | ![Workflow](https://github.com/zrapha/fame/actions/workflows/main.yml/badge.svg)
4 | [![codecov](https://codecov.io/gh/zRapha/famework/branch/master/graph/badge.svg?token=oMFazw4iLl)](https://codecov.io/gh/zRapha/famework)
5 | [![License: MPL v2](https://img.shields.io/badge/license-MPL--2.0-blue.svg)](https://www.mozilla.org/en-US/MPL/2.0/)
6 | 
7 | 
8 | 
11 | 
12 | ## Welcome to the Framework for Adversarial Malware Evaluation
13 | 
14 | FAME has been designed to evaluate ML-based malware classifiers against adversarial examples. It aims to provide an understanding of how byte-level transformations can be injected into Windows Portable Executable (PE) files and compromise models. Moreover, it supports integrity verification to ensure that the adversarial examples remain valid after manipulation. This work implements the action space proposed in the [OpenAI gym malware](https://github.com/endgameinc/gym-malware) environment. It has been implemented and tested using Fedora 30 and Ubuntu 16 with Python3. Library versions are defined in the `requirements.txt` file.
15 | 
16 | The framework consists of the following modules: ARMED, AIMED / AIMED-RL & GAME-UP.
17 | 
18 | ### GAME-UP: Generating Adversarial Malware Examples with Universal Perturbations
19 | 
20 | This module intends to analyze how Universal Adversarial Perturbations (UAPs) can be useful to create efficient adversarial examples compared to input-specific attacks. It explores how real malware examples in the problem-space affect the feature-space of classifiers to identify systematic weaknesses. Also, it implements a variant of adversarial training to improve the resilience of static ML-based malware classifiers for Windows PE binaries.
21 | 
22 | ### AIMED: Automatic Intelligent Modifications to Evade Detection
23 | 
24 | This approach focuses on understanding how sensitive static malware classifiers are to adversarial examples. It uses different techniques including Genetic Programming (GP) and Reinforcement Learning (RL) to inject perturbations into Windows PE malware without compromising its functionality, keeping the freshly generated adversarial example valid.
25 | 
26 | ### ARMED: Automatic Random Modifications to Evade Detection
27 | 
28 | With this option, sequences of transformations are chosen randomly to identify weak spots in the classifier. This module implements a pipeline that is able to automatically generate realizable adversarial examples in the malware context.
29 | 
30 | ## How to run FAME
31 | 
32 | Here we describe how to run `FAME` by installing the package directly from `pip`. For more detail about running from source and manual configuration of parameters, refer to the [install](https://github.com/zRapha/FAME/blob/master/Install.md) instructions.
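For reference, a from-source run (covered in the install instructions) boils down to cloning the repository, installing the pinned requirements, and calling the entry script with a module name:

```
git clone git@github.com:zRapha/FAME.git && cd FAME
pip install -r requirements.txt
./main.py aimed
```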
33 | 34 | Install `FAME`: 35 | ``` 36 | pip install famework 37 | ``` 38 | Run `FAME` with any module (e.g., AIMED): 39 | ``` 40 | fame aimed 41 | ``` 42 | 43 | ## Contributors 44 | 45 | We appreciate the contributions that helped to improve this work: 46 | 47 | | Contributor | University | Module | 48 | |-----------------|--------------------------------|------------------------| 49 | | Sebastian Franz | Technische Universität München | Reinforcement Learning | 50 | 51 | ## Citation 52 | 53 | If you find this work useful you are highly encouraged to cite the following articles. For the framework, you can refer to my dissertation: 54 | 55 | `FAME` 56 | ``` 57 | @book{labaca-castro2023fame, 58 | title={Machine Learning under Malware Attack}, 59 | author={Labaca-Castro, Raphael}, 60 | year={2023}, 61 | publisher={Springer Nature} 62 | } 63 | ``` 64 | --- 65 | If you worked with more specific modules feel free to reference them separately: 66 | 67 | `GAME-UP` 68 | ``` 69 | @article{labaca-castro2022universal, 70 | title={Realizable Universal Adversarial Perturbations for Malware}, 71 | author={Labaca-Castro, Raphael and Mu{\~n}oz-Gonz{\'a}lez, Luis and Pendlebury, Feargus and Rodosek, Gabi Dreo and Pierazzi, Fabio and Cavallaro, Lorenzo}, 72 | journal={arXiv preprint arXiv:2102.06747}, 73 | year={2022} 74 | } 75 | ``` 76 | 77 | `AIMED-RL` 78 | ``` 79 | @inproceedings{labaca-castro2021aimed-rl, 80 | title={AIMED-RL: Exploring Adversarial Malware Examples with Reinforcement Learning }, 81 | author={Labaca-Castro, Raphael and Franz, Sebastian and Rodosek, Gabi Dreo}, 82 | booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases (ECML PKDD)}, 83 | pages={37--52}, 84 | year={2021}, 85 | organization={Springer} 86 | } 87 | ``` 88 | 89 | `AIMED` 90 | ``` 91 | @inproceedings{labaca-castro2019aimed, 92 | title={AIMED: Evolving Malware with Genetic Programming to Evade Detection}, 93 | author={Labaca-Castro, Raphael and Schmitt, Corinna and Rodosek, Gabi Dreo}, 94 | booktitle={2019 18th IEEE International Conference On Trust, Security And Privacy In Computing And Communications/13th IEEE International Conference On Big Data Science And Engineering (TrustCom/BigDataSE)}, 95 | pages={240--247}, 96 | year={2019}, 97 | organization={IEEE} 98 | } 99 | ``` 100 | 101 | `ARMED` 102 | ``` 103 | @inproceedings{labaca-castro2019armed, 104 | title={ARMED: How Automatic Malware Modifications Can Evade Static Detection?}, 105 | author={Labaca-Castro, Raphael and Schmitt, Corinna and Rodosek, Gabi Dreo}, 106 | booktitle={2019 5th International Conference on Information Management (ICIM)}, 107 | pages={20--27}, 108 | year={2019}, 109 | organization={IEEE} 110 | } 111 | ``` 112 | -------------------------------------------------------------------------------- /src/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from collections import Counter 7 | 8 | matplotlib.use('Agg') 9 | 10 | 11 | def csv_into_list(CSV, sample): 12 | # Setting fields for CSV 13 | fields = ['Original_File', 'OF_Detections', 'Manipulated_File', 'MF_Detections', 'Perturbations', 14 | 'Perturbations_Injected', 15 | 'Full_Detections_Report', 'Full_Analysis_Report', 'Mod_File_Hash', 'Original_File_Hash', 'Date_Reported'] 16 | 17 | # Retrieve database 18 | df = pd.read_csv(CSV, names=fields, header=None) 19 | 20 | # Use only rows about %sample% 21 | df = 
df.loc[df['Original_File'] == 'samples/' + sample]
22 |     if df.empty:
23 |         print('No samples found with that name in the database.')
24 |         quit()
25 | 
26 |     # Identifying x, y & cleaning out detections values. Only for successful
27 |     # samples, otherwise just get perturbations as there are no detections
28 |     if not df['MF_Detections'].isnull().any():
29 |         # print(df['MF_Detections'])
30 |         df['MF_Detections'] = df['MF_Detections'].map(lambda x: x[:2])
31 |         detections = df['MF_Detections'].values.tolist()
32 | 
33 |         for i in range(len(detections)):
34 |             if '/' in detections[i]:
35 |                 detections[i] = detections[i][:1]
36 | 
37 |         # Merging both structures into one list skipping headers
38 |         perts_and_detections = list(map(list, zip(df['Perturbations'][0:], detections[0:])))
39 | 
40 |     else:
41 |         perts_and_detections = list(df['Perturbations'][1:])
42 | 
43 |     # Retrieving detections ratio for original file
44 |     benchmark = df['OF_Detections'].values[0]
45 | 
46 |     return perts_and_detections, benchmark[:2]
47 | 
48 | 
49 | def accumulative_counter(perts_and_detections):
50 |     keys = [str(k) for k in range(1, 26)]  # map objects are not subscriptable in Python 3
51 |     accumulative_dict = {key: 0 for key in keys}
52 |     counter_dict = {key: 0 for key in keys}
53 | 
54 |     val1, val2 = perts_and_detections[:2]
55 |     if val1 != val2:  # Check whether database.csv (successes) or fail_database.csv (fails) is handed
56 |         for i in range(len(perts_and_detections)):
57 |             accumulative_dict[perts_and_detections[i][0]] = accumulative_dict[perts_and_detections[i][0]] + \
58 |                 int(perts_and_detections[i][1])
59 |             counter_dict[perts_and_detections[i][0]] = counter_dict[perts_and_detections[i][0]] + 1
60 |     else:
61 |         c = Counter(perts_and_detections)
62 |         c = {int(k): int(v) for k, v in c.items()}
63 |         counter_dict = dict(sorted(c.items()))
64 | 
65 |     # Removing keys with zero values to avoid ZeroDivisionError
66 |     accumulative_dict = {k: v for k, v in accumulative_dict.items() if v != 0}
67 |     counter_dict = {k: v for k, v in counter_dict.items() if v != 0}
68 | 
69 |     return accumulative_dict, counter_dict, len(keys)
70 | 
71 | 
72 | def string_to_int_list(perts_and_detections):
73 |     # Converting str list into int list
74 |     list_perts = [int(a) for a, b in perts_and_detections]
75 |     list_detections = [int(b) for a, b in perts_and_detections]
76 |     new_list = sorted(list(map(list, zip(list_perts, list_detections))))
77 | 
78 |     return new_list
79 | 
80 | 
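# For instance, string_to_int_list([['5', '12'], ['3', '7']]) returns
# [[3, 7], [5, 12]]: pairs are cast to int and sorted by perturbation count.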
Mutations [n={}]".format(len(perts_and_detections))) 107 | ax.set_xlabel('Perturbations') 108 | ax.set_ylabel('Average of VirusTotal Detections') 109 | plt.hlines(y=int(benchmark), colors='r', xmin=0, xmax=len_keys, linestyles='dashed') # , label='Original file') 110 | plt.legend(loc=1) 111 | plt.savefig('graphics/' + sample + '/VTEvsPI.png') 112 | 113 | 114 | def scatter_plot(CSV, sample): 115 | # Converting CSV into list of list 116 | perts_and_detections, benchmark = csv_into_list(CSV, sample) 117 | 118 | # Converting str list into int list 119 | new_list = string_to_int_list(perts_and_detections) 120 | 121 | # Defining x, y and N 122 | x = [int(a) for a, b in new_list] 123 | y = [int(b) for a, b in new_list] 124 | area = 10 125 | 126 | # Plot each mutated sample in ARMED database 127 | ax = plt.gca() 128 | plt.scatter(x[:300], y[:300], s=area, c='black', alpha=0.5, label="Mutations (S')") 129 | plt.hlines(y=int(benchmark), colors='r', xmin=0, xmax=22, linestyles='dashed', label="Original (S)") 130 | 131 | # General formatting 132 | ax.set_xlim(1.9, 25.1) 133 | # ax.xaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}')) 134 | ax.set_ylim(0, 57) 135 | # ax.set_title("ARMED: Distribution of Mutations [n={}]".format(N)) 136 | ax.set_xlabel('Number of perturbations injected') 137 | ax.set_ylabel('Number of detection engines') # (max = 68)') 138 | ax.legend(loc='upper center', bbox_to_anchor=(0.576, 1.01), fancybox=False, shadow=False, ncol=5) 139 | plt.savefig('graphics/' + sample + '/scatter_plot.png') 140 | 141 | 142 | def ratio_functional(CSV, CSV_fail, sample): 143 | # Converting CSV into list of list 144 | perts_and_detections, benchmark = csv_into_list(CSV, sample) 145 | perts_fail, benchmark_fail = csv_into_list(CSV_fail, sample) 146 | 147 | # Counting detections of manipulated sample based on perturbations injected 148 | _, counter_dict, len_keys = accumulative_counter(perts_and_detections) 149 | _, counter_dict_fail, _ = accumulative_counter(perts_fail) 150 | # print('Number of samples per injection: (fail database)\n{}'.format(counter_dict_fail)) 151 | 152 | # Defining y vars 153 | y = list(counter_dict.values()) 154 | y_fail = list(counter_dict_fail.values()) 155 | 156 | # Plot ration of functional vs. non-functional 157 | plt.figure() 158 | 159 | # Values of each bar 160 | bars_s = y[:len_keys] 161 | bars_f = y_fail[:len_keys] 162 | sum_y = sum(y_fail[:len_keys]) + sum(y[:len_keys]) 163 | 164 | # Check 10 perts were injected and all p > 10 165 | if len(bars_s) < 10 or y[0] < 10: 166 | print('Not enough data for bar plot yet') 167 | quit() 168 | 169 | # Position of bars on x-axis 170 | r = list(range(24)) 171 | 172 | # Names of group and bar width 173 | names = map(str, list(range(26)))[2:] 174 | barWidth = 0.85 175 | 176 | # Create successful & failed mutations bars 177 | plt.bar(r, bars_s, color='darkgray', edgecolor='white', width=barWidth, label='Functional') 178 | plt.bar(r, bars_f, bottom=bars_s, color='gray', edgecolor='white', width=barWidth, label='Non-functional') 179 | 180 | # Formatting 181 | # plt.title("ARMED: Functional vs. 
Non-functional Mutations [n={}]".format(sum_y)) 182 | plt.hlines(y=sum_y / len(y[:len_keys]), colors='r', xmin=0, xmax=len_keys - 2, linestyles='dashed', label='Average') 183 | plt.ylim(0, max(y_fail) + y[0] + 5) 184 | plt.xticks(r, names) 185 | plt.xlabel("Number of perturbations injected") 186 | plt.ylabel("Number of mutations generated") 187 | plt.legend(loc=2) 188 | plt.savefig('graphics/' + sample + '/ratio_functional.png') 189 | 190 | 191 | if __name__ == '__main__': 192 | det_vs_pert('db/database.csv', 'original/keylogger') 193 | scatter_plot('db/database.csv', 'original/keylogger') 194 | ratio_functional('db/database.csv', 'db/fail_database.csv', 'original/keylogger') 195 | -------------------------------------------------------------------------------- /data/manipulate.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/endgameinc/gym-malware 2 | 3 | import lief 4 | import json 5 | import os 6 | import sys 7 | import array 8 | import struct # byte manipulations 9 | import random 10 | import tempfile 11 | import subprocess 12 | import functools 13 | import signal 14 | import multiprocessing 15 | 16 | MODULE_PATH = os.path.split(os.path.abspath(sys.modules[__name__].__file__))[0] 17 | 18 | COMMON_SECTION_NAMES = open(os.path.join( 19 | MODULE_PATH, 'section_names.txt'), 'r').read().rstrip().split('\n') 20 | 21 | COMMON_IMPORTS = json.load( 22 | open(os.path.join(MODULE_PATH, 'small_dll_imports.json'), 'r')) 23 | 24 | 25 | class MalwareManipulator(object): 26 | def __init__(self, bytez): 27 | self.bytez = bytez 28 | self.min_append_log2 = 5 29 | self.max_append_log2 = 8 30 | 31 | def __random_length(self): 32 | return 2**random.randint(self.min_append_log2, self.max_append_log2) 33 | 34 | def __binary_to_bytez(self, binary, dos_stub=False, imports=False, overlay=False, relocations=False, resources=False, tls=False): 35 | builder = lief.PE.Builder(binary) # write the file back as bytez 36 | if(dos_stub): 37 | builder.build_dos_stub(dos_stub) # rebuild DOS stub 38 | if(imports): 39 | builder.build_imports(imports) # rebuild IAT in another section 40 | builder.patch_imports(imports) # patch orig. 
import table with trampolines to new import table 41 | if(overlay): 42 | builder.build_overlay(overlay) # rebuild overlay 43 | if(relocations): 44 | builder.build_relocations(relocations) # rebuild relocation table in another section 45 | if(resources): 46 | builder.build_resources(resources) # rebuild resources in another section 47 | if(tls): 48 | builder.build_tls(tls) # rebuilt TLS object in another section 49 | builder.build() # perform the build process 50 | return array.array('B', builder.get_build()).tobytes() 51 | 52 | def overlay_append(self, seed=None): 53 | random.seed(seed) 54 | L = self.__random_length() 55 | # choose the upper bound for a uniform distribution in [0,upper] 56 | upper = random.randrange(256) 57 | # upper chooses the upper bound on uniform distribution: 58 | # upper=0 would append with all 0s 59 | # upper=126 would append with "printable ascii" 60 | # upper=255 would append with any character 61 | return self.bytez + bytes([random.randint(0, upper) for _ in range(L)]) 62 | 63 | def imports_append(self, seed=None): 64 | # add (unused) imports 65 | random.seed(seed) 66 | binary = lief.PE.parse(list(self.bytez)) 67 | # draw a library at random 68 | libname = random.choice(list(COMMON_IMPORTS.keys())) 69 | funcname = random.choice(list(COMMON_IMPORTS[libname])) 70 | lowerlibname = libname.lower() 71 | # find this lib in the imports, if it exists 72 | lib = None 73 | for im in binary.imports: 74 | if im.name.lower() == lowerlibname: 75 | lib = im 76 | break 77 | if lib is None: 78 | # add a new library 79 | lib = binary.add_library(libname) 80 | # get current names 81 | names = set([e.name for e in lib.entries]) 82 | if not funcname in names: 83 | lib.add_entry(funcname) 84 | 85 | self.bytez = self.__binary_to_bytez(binary,imports=True) 86 | 87 | return self.bytez 88 | 89 | def section_rename(self, seed=None): 90 | # rename a random section 91 | random.seed(seed) 92 | binary = lief.PE.parse(list(self.bytez)) 93 | targeted_section = random.choice(binary.sections) 94 | targeted_section.name = random.choice(COMMON_SECTION_NAMES)[:7] #actual version of lief not allowing 8 chars? 
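# (PE/COFF section names are limited to 8 bytes; the [:7] slice keeps one byte
#  spare to work around the LIEF limitation flagged in the comment above.)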
95 | 96 | self.bytez = self.__binary_to_bytez(binary) 97 | 98 | return self.bytez 99 | 100 | def section_add(self, seed=None): 101 | random.seed(seed) 102 | binary = lief.PE.parse(list(self.bytez)) 103 | new_section = lief.PE.Section( 104 | "".join(chr(random.randrange(ord('.'), ord('z'))) for _ in range(6))) 105 | 106 | # fill with random content 107 | upper = random.randrange(256) 108 | L = self.__random_length() 109 | new_section.content = [random.randint(0, upper) for _ in range(L)] 110 | 111 | new_section.virtual_address = max( 112 | [s.virtual_address + s.size for s in binary.sections]) 113 | # add a new empty section 114 | 115 | binary.add_section(new_section, 116 | random.choice([ 117 | lief.PE.SECTION_TYPES.BSS, 118 | lief.PE.SECTION_TYPES.DATA, 119 | lief.PE.SECTION_TYPES.EXPORT, 120 | lief.PE.SECTION_TYPES.IDATA, 121 | lief.PE.SECTION_TYPES.RELOCATION, 122 | lief.PE.SECTION_TYPES.RESOURCE, 123 | lief.PE.SECTION_TYPES.TEXT, 124 | lief.PE.SECTION_TYPES.TLS_, 125 | lief.PE.SECTION_TYPES.UNKNOWN, 126 | ])) 127 | 128 | self.bytez = self.__binary_to_bytez(binary) 129 | return self.bytez 130 | 131 | def section_append(self, seed=None): 132 | # append to a section (changes size and entropy) 133 | random.seed(seed) 134 | binary = lief.PE.parse(list(self.bytez)) 135 | targeted_section = random.choice(binary.sections) 136 | L = self.__random_length() 137 | available_size = targeted_section.size - len(targeted_section.content) 138 | if L > available_size: 139 | L = available_size 140 | 141 | upper = random.randrange(256) 142 | targeted_section.content = targeted_section.content + \ 143 | [random.randint(0, upper) for _ in range(L)] 144 | 145 | self.bytez = self.__binary_to_bytez(binary) 146 | return self.bytez 147 | 148 | # def section_reorder(self,param,seed=None): 149 | # # reorder directory of sections 150 | # pass 151 | 152 | def create_new_entry(self, seed=None): 153 | # create a new section with jump to old entry point, and change entry point 154 | # DRAFT: this may have a few technical issues with it (not accounting for relocations), but is a proof of concept for functionality 155 | random.seed(seed) 156 | 157 | binary = lief.PE.parse(list(self.bytez)) 158 | 159 | # get entry point 160 | entry_point = binary.optional_header.addressof_entrypoint 161 | 162 | # get name of section 163 | entryname = binary.section_from_rva(entry_point).name 164 | 165 | # create a new section 166 | new_section = lief.PE.Section(entryname + "".join(chr(random.randrange( 167 | ord('.'), ord('z'))) for _ in range(3))) # e.g., ".text" + 3 random characters 168 | # push [old_entry_point]; ret 169 | new_section.content = [ 170 | 0x68] + list(struct.pack("= 100: 144 | for z in range(2, self.size_population - 1): 145 | if self.members[z].code != self.members[elem].code and self.members[z].code != self.members[elem+1].code: 146 | self.members[elem] = self.members[z] 147 | break 148 | 149 | # Show updated population 150 | print('\n# Population: ', end='') 151 | [print(self.members[s].code, round(self.members[s].cost, 4), end=' # ') for s in range(len(self.members))] 152 | print('\n') 153 | 154 | def listEvasions(self): 155 | 156 | """ Show evasive members """ 157 | 158 | sequence_list = [] 159 | [sequence_list.append(sequence) for sequence in self.mutations_processed if 160 | sequence[2] > 0 and sequence[0] not in sequence_list] 161 | return sequence_list 162 | 163 | def allEvasion(self): 164 | 165 | """ Check whether all members are evasive """ 166 | 167 | duplicates = [] 168 | if self.members[0].cost < 100: 169 | 
return False 170 | for z in range(len(self.members) - 1): 171 | if self.members[z].cost == self.members[z + 1].cost: 172 | pass 173 | else: 174 | return False 175 | 176 | # Create a list with only member.code to make it hashable 177 | for k in self.members: 178 | duplicates.append(k.code) 179 | 180 | # Make sure there are no duplicated genes in the population 181 | if len(set(map(tuple, duplicates))) == len(self.members): 182 | print('\nAll sequences in the population lead to evasive mutations!') 183 | print('\nPopulation: ', end='') 184 | [print(self.members[z].code, self.members[z].cost, end=' # ') for z in range(len(self.members))] 185 | return True 186 | 187 | def generation(self, file, actions, search_uap=False): 188 | 189 | # Run until termination criteria are met 190 | if search_uap: 191 | while not self._generation_uap(actions): 192 | pass 193 | else: 194 | while not self._generation(file, actions): 195 | pass 196 | 197 | # Once finished, show evasive sequences if any sorted by most evasive 198 | if self.new_evasions: 199 | list_evasions = sorted(self.listEvasions(), key=operator.itemgetter(2), reverse=True) 200 | number_fittest_evasions = math.floor(len(list_evasions) / 10) 201 | print('\nAll evasive sequences found: {}\n'.format(len(list_evasions))) 202 | print('Displaying only 10% of fittest evasions:') 203 | for seq in range(number_fittest_evasions): 204 | print('Sequence: {} -- Fitness: {} -- Evasions: {}'.format(list_evasions[seq][0], 205 | round(list_evasions[seq][1], 2), list_evasions[seq][2])) 206 | return list_evasions 207 | else: 208 | print('No evasive sequences found.') 209 | 210 | return 0 211 | 212 | def _generation(self, sample, actions): 213 | 214 | # Set UseVT to VirusTotal report 215 | useVT = cfg.file.getboolean('remote', 'useVT') 216 | 217 | # Call selection before breeding 218 | self.selection() 219 | 220 | # Breeding & mutating and adding children to the members list for Selection afterwards 221 | self.breed() 222 | 223 | gene_num = 0 224 | scanner = cfg.file['aimed']['model'] 225 | for member in self.members: 226 | existing_member = False 227 | 228 | # If mutation was processed retrieve fitness value & avoid processing again 229 | for x in range(len(self.mutations_processed)): 230 | if self.mutations_processed[x][0] == member.code: 231 | member.cost = self.mutations_processed[x][1] 232 | # print('\nFitness: {}'.format(member.cost)) 233 | existing_member = True 234 | break 235 | 236 | evasion = 0 237 | if not existing_member: 238 | 239 | # First generation calculates all genes, then breeds+mutates 2 members per generation 240 | gene_num += 1 241 | if self.generationNumber == 1: 242 | print('# Calculating fitness for gene {} of {}: {} #'.format(gene_num, len(self.members), 243 | member.code)) 244 | else: 245 | print('# Calculating fitness for child {}: {} #'.format(gene_num, member.code)) 246 | 247 | # Inject children sequences to input object to create four adversarial examples 248 | bin_bytes = f.readfile(sample) 249 | mod_sample = f.rec_mod_files(bin_bytes, actions, member.code, len(member.code) - 1) 250 | 251 | # If adversarial file returns errors, terminate in current generation 252 | if not mod_sample: 253 | return True 254 | 255 | # Collect info to writeCSV function 256 | mod_sample_hash = f.hash_files(mod_sample) 257 | sample_report = {'positives': 1, 'total': 1} 258 | CSV = f.collect_info_CSV(sample, sample_report, len(member.code), member.code, 259 | mod_sample_hash, f.hash_files(sample)) 260 | 261 | # Analyze functionality results 262 | if 
cfg.file.getboolean('aimed', 'integrityCheck'): 263 | funcional, url_sandbox = i.malware_analysis(mod_sample, useVT, CSV) 264 | else: 265 | # When f.batch_functionality_test() is used instead of online verification 266 | funcional, url_sandbox = True, "www.no_integrity_test.com" 267 | 268 | # Analyze detection results 269 | if funcional: 270 | # print('Running detection for gene:', member.code) 271 | detected, _ = i.malware_detection(mod_sample, scanner) 272 | mutation_name = str(len(member.code)) + '_m.exe' 273 | evasion = i.save_file_database(detected, mutation_name, url_sandbox, CSV, scanner) 274 | self.new_evasions += evasion 275 | 276 | # Calculate difference between original sample and mutation 277 | self.diff_samples = f.get_difference(sample, mod_sample) 278 | diff_adjusted = round(self.diff_samples / 100000, 3) # Constant empirically defined 279 | 280 | # Set cost to adversarial instances 281 | member.calcCost(detected, self.generationNumber, diff_adjusted) 282 | else: 283 | # Send empty when corrupt 284 | member.calcCost('', self.generationNumber, 0) 285 | self.corrupt_mutations += 1 286 | 287 | self.mutations_processed.append([member.code, member.cost, evasion]) 288 | 289 | print('Sequence: {} – Fitness: {}\n'.format(member.code, member.cost)) 290 | 291 | if self.new_evasions: 292 | print('# Evasive mutations found: {} #'.format(self.new_evasions)) 293 | print('# Corrupt mutations found: {} #\n'.format(self.corrupt_mutations)) 294 | 295 | # Termination: number of evasions achieved or number of generations reach termination defined 296 | files_expected = cfg.file.getint('aimed', 'advFilesExpected') 297 | termination_per_generation = files_expected ** 2 if files_expected >= 10 else self.rounds 298 | if self.generationNumber == termination_per_generation: # self.new_evasions >= files_expected or 299 | return True 300 | 301 | self.generationNumber += 1 302 | return False 303 | 304 | def _generation_uap(self, actions): 305 | 306 | # Set UseVT to VirusTotal report 307 | useVT = cfg.file.getboolean('remote', 'useVT') 308 | 309 | # Call selection before breeding 310 | self.selection() 311 | 312 | # Breeding & mutating and adding children to the members list for Selection afterwards 313 | self.breed() 314 | 315 | # Calculate size of directory 316 | files_exp_set = os.listdir(EXPLORATION_SET) 317 | size_exp_set = len(files_exp_set) 318 | 319 | gene_num = 0 320 | scanner = cfg.file['aimed']['model'] 321 | for member in self.members: 322 | existing_member = False 323 | 324 | # If mutation was processed retrieve fitness value & avoid processing again 325 | for x in range(len(self.mutations_processed)): 326 | if self.mutations_processed[x][0] == member.code: 327 | member.cost = self.mutations_processed[x][1] 328 | # print('\nFitness: {}'.format(member.cost)) 329 | existing_member = True 330 | break 331 | 332 | if not existing_member: 333 | 334 | # First generation calculates all genes, then breeds+mutates 2 members per generation 335 | gene_num += 1 336 | if self.generationNumber == 1: 337 | print('# Calculating fitness for gene {} of {}: {} #'.format(gene_num, len(self.members), 338 | member.code)) 339 | else: 340 | print('# Calculating fitness for child {}: {} #'.format(gene_num, member.code)) 341 | 342 | # Picking sequentially each file from source folder 343 | current_file = 1 344 | evasions_in_generation = 0 345 | for each_sample in tqdm(sorted(os.listdir(EXPLORATION_SET))): 346 | 347 | # Convert selected sample into binaries 348 | sample = os.path.join(EXPLORATION_SET, each_sample) 349 
| bin_bytes = f.readfile(sample) 350 | 351 | # Inject children sequences to input file to create four adversarial examples 352 | mod_sample = f.rec_mod_files(bin_bytes, actions, member.code, len(member.code) - 1) 353 | 354 | # If adversarial example returns errors, terminate in current generation 355 | if not mod_sample: 356 | os.rename(os.path.join(EXPLORATION_SET, each_sample), EXPLORATION_SET + 'LIEF_Error_' + each_sample) 357 | return True 358 | 359 | # Collect info to writeCSV function 360 | mod_sample_hash = f.hash_files(mod_sample) 361 | sample_report = {'positives': 1, 'total': 1} 362 | CSV = f.collect_info_CSV(sample, sample_report, len(member.code), member.code, 363 | mod_sample_hash, f.hash_files(sample)) 364 | 365 | # Analyze functionality results 366 | if cfg.file.getboolean('aimed', 'integrityCheck'): 367 | funcional, url_sandbox = i.malware_analysis(mod_sample, useVT, CSV) 368 | else: 369 | # When f.batch_functionality_test() is used instead of online verification 370 | funcional, url_sandbox = True, "www.no_integrity_test.com" 371 | 372 | # Analyze detection results 373 | if funcional: 374 | # print('Running detection for gene:', member.code) 375 | detected, score = i.malware_detection(mod_sample, scanner, verbose=False) 376 | mutation_name = str(len(member.code)) + '_m.exe' 377 | self.new_evasions += i.save_file_database(detected, mutation_name, url_sandbox, CSV, scanner, 378 | verbose=False) 379 | 380 | # Calculate difference between original sample and mutation 381 | self.diff_samples = f.get_difference(sample, mod_sample) 382 | diff_adjusted = round(self.diff_samples / 100000, 3) # Constant empirically defined 383 | 384 | # Set cost to adversarial instances 385 | member.calcCost(detected, self.generationNumber, diff_adjusted, size_dir=size_exp_set, 386 | conf_rate=score, search_UAP=True) 387 | 388 | if not detected: 389 | evasions_in_generation += 1 390 | else: 391 | # Send empty when corrupt 392 | member.calcCost('', self.generationNumber, 0, size_dir=size_exp_set, search_UAP=True) 393 | self.corrupt_mutations += 1 394 | 395 | current_file += 1 396 | 397 | # Check if member has potential to be UAP 398 | if evasions_in_generation >= 20: 399 | self.potential_uap.append([member.code, member.cost, evasions_in_generation]) 400 | 401 | self.mutations_processed.append([member.code, member.cost, evasions_in_generation]) 402 | 403 | print('\nSequence: {} – Fitness: {} - Evasions: {}\n'.format(member.code, round(member.cost, 4), 404 | evasions_in_generation)) 405 | 406 | if self.potential_uap: 407 | print('# Potential UAP candidates found: {} #'.format(len(self.potential_uap))) 408 | 409 | # Termination: number of evasions achieved or number of generations reach termination defined 410 | files_expected = cfg.file.getint('aimed', 'advFilesExpected') 411 | termination_per_generation = files_expected ** 2 if files_expected >= 10 else self.rounds 412 | if self.generationNumber == termination_per_generation: 413 | if self.potential_uap: 414 | print("\nUAP candidates:") 415 | for candidate in range(len(self.potential_uap)): 416 | print('Sequence: {} -- Fitness: {} -- Evasions: {}'.format(self.potential_uap[candidate][0], 417 | round(self.potential_uap[candidate][1], 2), 418 | self.potential_uap[candidate][2])) 419 | return True 420 | 421 | self.generationNumber += 1 422 | return False 423 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | Copyright (c) Raphael 
Labaca Castro 2 | 3 | Mozilla Public License Version 2.0 4 | ================================== 5 | 6 | 1. Definitions 7 | -------------- 8 | 9 | 1.1. "Contributor" 10 | means each individual or legal entity that creates, contributes to 11 | the creation of, or owns Covered Software. 12 | 13 | 1.2. "Contributor Version" 14 | means the combination of the Contributions of others (if any) used 15 | by a Contributor and that particular Contributor's Contribution. 16 | 17 | 1.3. "Contribution" 18 | means Covered Software of a particular Contributor. 19 | 20 | 1.4. "Covered Software" 21 | means Source Code Form to which the initial Contributor has attached 22 | the notice in Exhibit A, the Executable Form of such Source Code 23 | Form, and Modifications of such Source Code Form, in each case 24 | including portions thereof. 25 | 26 | 1.5. "Incompatible With Secondary Licenses" 27 | means 28 | 29 | (a) that the initial Contributor has attached the notice described 30 | in Exhibit B to the Covered Software; or 31 | 32 | (b) that the Covered Software was made available under the terms of 33 | version 1.1 or earlier of the License, but not also under the 34 | terms of a Secondary License. 35 | 36 | 1.6. "Executable Form" 37 | means any form of the work other than Source Code Form. 38 | 39 | 1.7. "Larger Work" 40 | means a work that combines Covered Software with other material, in 41 | a separate file or files, that is not Covered Software. 42 | 43 | 1.8. "License" 44 | means this document. 45 | 46 | 1.9. "Licensable" 47 | means having the right to grant, to the maximum extent possible, 48 | whether at the time of the initial grant or subsequently, any and 49 | all of the rights conveyed by this License. 50 | 51 | 1.10. "Modifications" 52 | means any of the following: 53 | 54 | (a) any file in Source Code Form that results from an addition to, 55 | deletion from, or modification of the contents of Covered 56 | Software; or 57 | 58 | (b) any new file in Source Code Form that contains any Covered 59 | Software. 60 | 61 | 1.11. "Patent Claims" of a Contributor 62 | means any patent claim(s), including without limitation, method, 63 | process, and apparatus claims, in any patent Licensable by such 64 | Contributor that would be infringed, but for the grant of the 65 | License, by the making, using, selling, offering for sale, having 66 | made, import, or transfer of either its Contributions or its 67 | Contributor Version. 68 | 69 | 1.12. "Secondary License" 70 | means either the GNU General Public License, Version 2.0, the GNU 71 | Lesser General Public License, Version 2.1, the GNU Affero General 72 | Public License, Version 3.0, or any later versions of those 73 | licenses. 74 | 75 | 1.13. "Source Code Form" 76 | means the form of the work preferred for making modifications. 77 | 78 | 1.14. "You" (or "Your") 79 | means an individual or a legal entity exercising rights under this 80 | License. For legal entities, "You" includes any entity that 81 | controls, is controlled by, or is under common control with You. For 82 | purposes of this definition, "control" means (a) the power, direct 83 | or indirect, to cause the direction or management of such entity, 84 | whether by contract or otherwise, or (b) ownership of more than 85 | fifty percent (50%) of the outstanding shares or beneficial 86 | ownership of such entity. 87 | 88 | 2. License Grants and Conditions 89 | -------------------------------- 90 | 91 | 2.1. 
Grants 92 | 93 | Each Contributor hereby grants You a world-wide, royalty-free, 94 | non-exclusive license: 95 | 96 | (a) under intellectual property rights (other than patent or trademark) 97 | Licensable by such Contributor to use, reproduce, make available, 98 | modify, display, perform, distribute, and otherwise exploit its 99 | Contributions, either on an unmodified basis, with Modifications, or 100 | as part of a Larger Work; and 101 | 102 | (b) under Patent Claims of such Contributor to make, use, sell, offer 103 | for sale, have made, import, and otherwise transfer either its 104 | Contributions or its Contributor Version. 105 | 106 | 2.2. Effective Date 107 | 108 | The licenses granted in Section 2.1 with respect to any Contribution 109 | become effective for each Contribution on the date the Contributor first 110 | distributes such Contribution. 111 | 112 | 2.3. Limitations on Grant Scope 113 | 114 | The licenses granted in this Section 2 are the only rights granted under 115 | this License. No additional rights or licenses will be implied from the 116 | distribution or licensing of Covered Software under this License. 117 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 118 | Contributor: 119 | 120 | (a) for any code that a Contributor has removed from Covered Software; 121 | or 122 | 123 | (b) for infringements caused by: (i) Your and any other third party's 124 | modifications of Covered Software, or (ii) the combination of its 125 | Contributions with other software (except as part of its Contributor 126 | Version); or 127 | 128 | (c) under Patent Claims infringed by Covered Software in the absence of 129 | its Contributions. 130 | 131 | This License does not grant any rights in the trademarks, service marks, 132 | or logos of any Contributor (except as may be necessary to comply with 133 | the notice requirements in Section 3.4). 134 | 135 | 2.4. Subsequent Licenses 136 | 137 | No Contributor makes additional grants as a result of Your choice to 138 | distribute the Covered Software under a subsequent version of this 139 | License (see Section 10.2) or under the terms of a Secondary License (if 140 | permitted under the terms of Section 3.3). 141 | 142 | 2.5. Representation 143 | 144 | Each Contributor represents that the Contributor believes its 145 | Contributions are its original creation(s) or it has sufficient rights 146 | to grant the rights to its Contributions conveyed by this License. 147 | 148 | 2.6. Fair Use 149 | 150 | This License is not intended to limit any rights You have under 151 | applicable copyright doctrines of fair use, fair dealing, or other 152 | equivalents. 153 | 154 | 2.7. Conditions 155 | 156 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 157 | in Section 2.1. 158 | 159 | 3. Responsibilities 160 | ------------------- 161 | 162 | 3.1. Distribution of Source Form 163 | 164 | All distribution of Covered Software in Source Code Form, including any 165 | Modifications that You create or to which You contribute, must be under 166 | the terms of this License. You must inform recipients that the Source 167 | Code Form of the Covered Software is governed by the terms of this 168 | License, and how they can obtain a copy of this License. You may not 169 | attempt to alter or restrict the recipients' rights in the Source Code 170 | Form. 171 | 172 | 3.2. 
Distribution of Executable Form 173 | 174 | If You distribute Covered Software in Executable Form then: 175 | 176 | (a) such Covered Software must also be made available in Source Code 177 | Form, as described in Section 3.1, and You must inform recipients of 178 | the Executable Form how they can obtain a copy of such Source Code 179 | Form by reasonable means in a timely manner, at a charge no more 180 | than the cost of distribution to the recipient; and 181 | 182 | (b) You may distribute such Executable Form under the terms of this 183 | License, or sublicense it under different terms, provided that the 184 | license for the Executable Form does not attempt to limit or alter 185 | the recipients' rights in the Source Code Form under this License. 186 | 187 | 3.3. Distribution of a Larger Work 188 | 189 | You may create and distribute a Larger Work under terms of Your choice, 190 | provided that You also comply with the requirements of this License for 191 | the Covered Software. If the Larger Work is a combination of Covered 192 | Software with a work governed by one or more Secondary Licenses, and the 193 | Covered Software is not Incompatible With Secondary Licenses, this 194 | License permits You to additionally distribute such Covered Software 195 | under the terms of such Secondary License(s), so that the recipient of 196 | the Larger Work may, at their option, further distribute the Covered 197 | Software under the terms of either this License or such Secondary 198 | License(s). 199 | 200 | 3.4. Notices 201 | 202 | You may not remove or alter the substance of any license notices 203 | (including copyright notices, patent notices, disclaimers of warranty, 204 | or limitations of liability) contained within the Source Code Form of 205 | the Covered Software, except that You may alter any license notices to 206 | the extent required to remedy known factual inaccuracies. 207 | 208 | 3.5. Application of Additional Terms 209 | 210 | You may choose to offer, and to charge a fee for, warranty, support, 211 | indemnity or liability obligations to one or more recipients of Covered 212 | Software. However, You may do so only on Your own behalf, and not on 213 | behalf of any Contributor. You must make it absolutely clear that any 214 | such warranty, support, indemnity, or liability obligation is offered by 215 | You alone, and You hereby agree to indemnify every Contributor for any 216 | liability incurred by such Contributor as a result of warranty, support, 217 | indemnity or liability terms You offer. You may include additional 218 | disclaimers of warranty and limitations of liability specific to any 219 | jurisdiction. 220 | 221 | 4. Inability to Comply Due to Statute or Regulation 222 | --------------------------------------------------- 223 | 224 | If it is impossible for You to comply with any of the terms of this 225 | License with respect to some or all of the Covered Software due to 226 | statute, judicial order, or regulation then You must: (a) comply with 227 | the terms of this License to the maximum extent possible; and (b) 228 | describe the limitations and the code they affect. Such description must 229 | be placed in a text file included with all distributions of the Covered 230 | Software under this License. Except to the extent prohibited by statute 231 | or regulation, such description must be sufficiently detailed for a 232 | recipient of ordinary skill to be able to understand it. 233 | 234 | 5. Termination 235 | -------------- 236 | 237 | 5.1. 
The rights granted under this License will terminate automatically 238 | if You fail to comply with any of its terms. However, if You become 239 | compliant, then the rights granted under this License from a particular 240 | Contributor are reinstated (a) provisionally, unless and until such 241 | Contributor explicitly and finally terminates Your grants, and (b) on an 242 | ongoing basis, if such Contributor fails to notify You of the 243 | non-compliance by some reasonable means prior to 60 days after You have 244 | come back into compliance. Moreover, Your grants from a particular 245 | Contributor are reinstated on an ongoing basis if such Contributor 246 | notifies You of the non-compliance by some reasonable means, this is the 247 | first time You have received notice of non-compliance with this License 248 | from such Contributor, and You become compliant prior to 30 days after 249 | Your receipt of the notice. 250 | 251 | 5.2. If You initiate litigation against any entity by asserting a patent 252 | infringement claim (excluding declaratory judgment actions, 253 | counter-claims, and cross-claims) alleging that a Contributor Version 254 | directly or indirectly infringes any patent, then the rights granted to 255 | You by any and all Contributors for the Covered Software under Section 256 | 2.1 of this License shall terminate. 257 | 258 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 259 | end user license agreements (excluding distributors and resellers) which 260 | have been validly granted by You or Your distributors under this License 261 | prior to termination shall survive termination. 262 | 263 | ************************************************************************ 264 | * * 265 | * 6. Disclaimer of Warranty * 266 | * ------------------------- * 267 | * * 268 | * Covered Software is provided under this License on an "as is" * 269 | * basis, without warranty of any kind, either expressed, implied, or * 270 | * statutory, including, without limitation, warranties that the * 271 | * Covered Software is free of defects, merchantable, fit for a * 272 | * particular purpose or non-infringing. The entire risk as to the * 273 | * quality and performance of the Covered Software is with You. * 274 | * Should any Covered Software prove defective in any respect, You * 275 | * (not any Contributor) assume the cost of any necessary servicing, * 276 | * repair, or correction. This disclaimer of warranty constitutes an * 277 | * essential part of this License. No use of any Covered Software is * 278 | * authorized under this License except under this disclaimer. * 279 | * * 280 | ************************************************************************ 281 | 282 | ************************************************************************ 283 | * * 284 | * 7. Limitation of Liability * 285 | * -------------------------- * 286 | * * 287 | * Under no circumstances and under no legal theory, whether tort * 288 | * (including negligence), contract, or otherwise, shall any * 289 | * Contributor, or anyone who distributes Covered Software as * 290 | * permitted above, be liable to You for any direct, indirect, * 291 | * special, incidental, or consequential damages of any character * 292 | * including, without limitation, damages for lost profits, loss of * 293 | * goodwill, work stoppage, computer failure or malfunction, or any * 294 | * and all other commercial damages or losses, even if such party * 295 | * shall have been informed of the possibility of such damages. 
This * 296 | * limitation of liability shall not apply to liability for death or * 297 | * personal injury resulting from such party's negligence to the * 298 | * extent applicable law prohibits such limitation. Some * 299 | * jurisdictions do not allow the exclusion or limitation of * 300 | * incidental or consequential damages, so this exclusion and * 301 | * limitation may not apply to You. * 302 | * * 303 | ************************************************************************ 304 | 305 | 8. Litigation 306 | ------------- 307 | 308 | Any litigation relating to this License may be brought only in the 309 | courts of a jurisdiction where the defendant maintains its principal 310 | place of business and such litigation shall be governed by laws of that 311 | jurisdiction, without reference to its conflict-of-law provisions. 312 | Nothing in this Section shall prevent a party's ability to bring 313 | cross-claims or counter-claims. 314 | 315 | 9. Miscellaneous 316 | ---------------- 317 | 318 | This License represents the complete agreement concerning the subject 319 | matter hereof. If any provision of this License is held to be 320 | unenforceable, such provision shall be reformed only to the extent 321 | necessary to make it enforceable. Any law or regulation which provides 322 | that the language of a contract shall be construed against the drafter 323 | shall not be used to construe this License against a Contributor. 324 | 325 | 10. Versions of the License 326 | --------------------------- 327 | 328 | 10.1. New Versions 329 | 330 | Mozilla Foundation is the license steward. Except as provided in Section 331 | 10.3, no one other than the license steward has the right to modify or 332 | publish new versions of this License. Each version will be given a 333 | distinguishing version number. 334 | 335 | 10.2. Effect of New Versions 336 | 337 | You may distribute the Covered Software under the terms of the version 338 | of the License under which You originally received the Covered Software, 339 | or under the terms of any subsequent version published by the license 340 | steward. 341 | 342 | 10.3. Modified Versions 343 | 344 | If you create software not governed by this License, and you want to 345 | create a new license for such software, you may create and use a 346 | modified version of this License if you rename the license and remove 347 | any references to the name of the license steward (except to note that 348 | such modified license differs from this License). 349 | 350 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 351 | Licenses 352 | 353 | If You choose to distribute Source Code Form that is Incompatible With 354 | Secondary Licenses under the terms of this version of the License, the 355 | notice described in Exhibit B of this License must be attached. 356 | 357 | Exhibit A - Source Code Form License Notice 358 | ------------------------------------------- 359 | 360 | This Source Code Form is subject to the terms of the Mozilla Public 361 | License, v. 2.0. If a copy of the MPL was not distributed with this 362 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 363 | 364 | If it is not possible or desirable to put the notice in a particular 365 | file, then You may include the notice in a location (such as a LICENSE 366 | file in a relevant directory) where a recipient would be likely to look 367 | for such a notice. 368 | 369 | You may add additional accurate notices of copyright ownership. 
370 | 371 | Exhibit B - "Incompatible With Secondary Licenses" Notice 372 | --------------------------------------------------------- 373 | 374 | This Source Code Form is "Incompatible With Secondary Licenses", as 375 | defined by the Mozilla Public License, v. 2.0. 376 | -------------------------------------------------------------------------------- /src/defense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import time 4 | import joblib 5 | import numpy as np 6 | import src.config as cfg 7 | import src.functions as f 8 | import lightgbm as lgb 9 | from sklearn.metrics import roc_auc_score 10 | from data.pefeatures import PEFeatureReader 11 | from sklearn.preprocessing import StandardScaler 12 | from sklearn.linear_model import LogisticRegression 13 | 14 | 15 | NPZ_PATH = cfg.file['paths']['npz'] 16 | MODEL_PATH = cfg.file['paths']['model_path'] 17 | VECTORIZED_PATH = cfg.file['paths']['vectorized_path'] 18 | 19 | 20 | class Defense: 21 | 22 | def __init__(self, model, csv_path, features_path, number_examples): 23 | self.model = model 24 | self.csv_path = csv_path 25 | self.features_path = features_path 26 | self.number_examples = number_examples 27 | 28 | @staticmethod 29 | def create_uap_datasets(csv_path, features_path, uap_vector): 30 | """ 31 | Apply the UAP to the exploration set to validate the results obtained during exploration. 32 | Create two feature-space datasets: 33 | i) original examples (before UAP injection) 34 | ii) adversarial examples (after injecting UAP) 35 | 36 | Input: 37 | csv_path: path to load CSV file with malware examples 38 | features_path: path to save features from examples 39 | uap_vector: UAP vector calculated using model and dataset 40 | """ 41 | 42 | # Save features from problem-space malware 43 | f.save_features_malware(csv_path=csv_path, features_path=features_path, pert_vector=uap_vector) 44 | 45 | @staticmethod 46 | def extract_perturbation_from_features(features_path): 47 | """ 48 | Extract the noise / perturbation by subtracting the features of the original 49 | malware from those of the adversarial examples generated from it. 50 | i) Load original and adversarial datasets (features) 51 | ii) Subtract the original from the adversarial features, leaving only the noise 52 | 53 | Input: 54 | features_path: path to load features from examples 55 | """ 56 | # Load datasets of original & adversarial examples to extract noise 57 | adv_examples = np.load(features_path + 'adv_examples_uap_compress.npz') 58 | original_malware = np.load(features_path + 'orig_files_uap_compress.npz') 59 | 60 | # Extract features 61 | features_adv_examples = np.array(adv_examples['features']) 62 | features_original = np.array(original_malware['features']) 63 | 64 | # Calculate noise / perturbation based on adversarial - original malware 65 | noise = features_adv_examples - features_original 66 | 67 | return noise 68 | 69 | # DEFENSE 70 | 71 | # Define statistical model to generate adversarial examples 72 | @staticmethod 73 | def attack_statistical_model(malware_input, noise): 74 | """ 75 | Define statistical model to approximate noise / perturbations.
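The noise is modeled per feature as Gaussian: for each feature j, a perturbation r_j ~ N(mean(noise[:, j]), std(noise[:, j])**2) is drawn and added to feature j of every malware example (see the code below).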
76 | Author: Luis Munoz Gonzalez 77 | 78 | Input: 79 | malware_input: batch of malware examples 80 | noise: perturbation injected to generate adversarial examples 81 | """ 82 | # Number of features 83 | number_features = malware_input.shape[1] 84 | 85 | # Define mean and standard deviation vectors 86 | meanV = np.zeros(number_features) 87 | stdV = np.zeros(number_features) 88 | 89 | # Fit meanV and stdV to the noise, per feature 90 | for each_feat in range(number_features): 91 | meanV[each_feat] = np.mean(noise[:, each_feat]) 92 | stdV[each_feat] = np.std(noise[:, each_feat]) 93 | 94 | # Generate adversarial examples 95 | adv_ex = np.zeros(malware_input.shape) 96 | for e in range(meanV.size): 97 | rd = np.random.randn(malware_input.shape[0]) * stdV[e] + meanV[e] 98 | adv_ex[:, e] = malware_input[:, e] + rd 99 | 100 | return adv_ex 101 | 102 | def generate_adv_examples_statistical_model(self, malware_batch, noise, npz_path): 103 | """ 104 | Generate adversarial examples using a statistical model that approximates 105 | the distribution of the noise extracted from the features (in this case Gaussian) 106 | """ 107 | # Load adversarial examples or generate them using the statistical model 108 | if os.path.exists(npz_path + 'adversarial_examples_approximated.npz'): 109 | adv_examples_approximated = np.load(npz_path + 'adversarial_examples_approximated.npz') 110 | adv_examples_approximated = adv_examples_approximated['features'] 111 | 112 | else: 113 | # Generate the same number of adversarial examples as malicious ones 114 | adv_examples_approximated = self.attack_statistical_model(malware_batch, noise) 115 | 116 | # Save adversarial examples generated with the statistical model above 117 | np.savez(npz_path + 'adversarial_examples_approximated.npz', features=adv_examples_approximated) 118 | 119 | print('Adversarial data shape:', adv_examples_approximated.shape) 120 | return adv_examples_approximated 121 | 122 | def adversarial_training(self, noise): 123 | """ 124 | Perform adversarial training using either 1/2 of the dataset with adversarial 125 | examples + 1/2 benign (pure), or 1/4 of the dataset with adversarial 126 | examples + 1/4 malicious + 1/2 benign (mixed). 127 | Also, train a baseline model to use as a benchmark. 128 | 129 | i) Baseline: Train model with N malware and N benign examples. 130 | 131 | ii) Pure: Adversarially-train model 132 | a) Generate N adversarial samples with statistical model 133 | b) Train using N statistically-generated adversarial examples 134 | and N benign examples. 135 | 136 | iii) Mixed: Adversarially-train model 137 | a) Generate N/2 adversarial samples with statistical model 138 | b) Train using N/2 statistically-generated adversarial examples, 139 | N/2 malware and N benign examples.
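E.g., with N = 50,000: baseline = 50k malicious + 50k benign; pure = 50k adversarial + 50k benign; mixed = 25k malicious + 25k adversarial + 50k benign.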
140 | 141 | Input: 142 | noise: perturbation extracted from the features of adversarial examples and the original files 143 | 144 | """ 145 | 146 | # Define size of malicious, benign, and adversarial datasets 147 | # number_examples = 50000 148 | 149 | # Load EMBER data 150 | print('Loading datasets to train baseline & adversarial models: ') 151 | feature_reader = PEFeatureReader() 152 | X_train, y_train = feature_reader.read_vectorized_features(VECTORIZED_PATH, 'train', feature_version=1) 153 | if self.number_examples == 50000: 154 | start_examples = 38800 155 | end_examples = 189000 156 | X_train = X_train[start_examples:end_examples] 157 | y_train = y_train[start_examples:end_examples] 158 | print('Original features shape:', X_train.shape) 159 | 160 | # Filter only malicious 161 | malicious_rows = (y_train == 1) 162 | malware_batch = X_train[malicious_rows] 163 | malware_batch = malware_batch[:self.number_examples] 164 | print('Malicious features shape:', malware_batch.shape) 165 | 166 | # Filter only benign 167 | benign_rows = (y_train == 0) 168 | benign_batch = X_train[benign_rows] 169 | benign_batch = benign_batch[:self.number_examples] 170 | print('Benign features shape:', benign_batch.shape) 171 | 172 | # Generate adversarial examples for adversarial training 173 | adversarial_batch = self.generate_adv_examples_statistical_model(malware_batch, noise, npz_path=NPZ_PATH) 174 | 175 | print('\na) Train LGBM baseline with {} malicious and {} benign files'.format(self.number_examples, 176 | self.number_examples)) 177 | 178 | # Define datasets for baseline training 179 | 180 | # Load the model if it already exists, otherwise train it 181 | if os.path.exists(MODEL_PATH + 'ember_model_baseline.pkl'): 182 | lgbm_model_baseline = joblib.load(MODEL_PATH + 'ember_model_baseline.pkl') 183 | 184 | else: 185 | 186 | X_train = np.concatenate((malware_batch, benign_batch), axis=0) 187 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 188 | print('Train data shape for baseline model:', X_train.shape) 189 | 190 | # Create dataset for training 191 | lgbm_dataset = lgb.Dataset(X_train, y_train) 192 | print('Finished preparing dataset for training.') 193 | 194 | # Define parameters & train 195 | start_training = time.time() 196 | params = {"application": "binary"} 197 | lgbm_model_baseline = lgb.train(params, lgbm_dataset) 198 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 199 | 200 | lgbm_model_baseline.save_model(MODEL_PATH + 'ember_model_baseline.txt') 201 | joblib.dump(lgbm_model_baseline, MODEL_PATH + 'ember_model_baseline.pkl') 202 | print('Baseline model saved.') 203 | 204 | print('b) Adversarially train (Pure) LGBM with {} adversarial and {} benign files'.format(self.number_examples, 205 | self.number_examples)) 206 | 207 | # Define dataset for adversarially trained model 'pure' (AEs and benign) 208 | 209 | # Load the model if it already exists, otherwise train it 210 | if os.path.exists(MODEL_PATH + 'ember_model_adv_trained_pure.pkl'): 211 | lgbm_model_adv_trained_pure = joblib.load(MODEL_PATH + 'ember_model_adv_trained_pure.pkl') 212 | 213 | else: 214 | 215 | X_train = np.concatenate((adversarial_batch, benign_batch), axis=0) 216 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 217 | print('Train data shape for pure adversarial model:', X_train.shape) 218 | 219 | # Create dataset for training 220 | lgbm_dataset = lgb.Dataset(X_train, y_train) 221 | print('Finished
preparing dataset for training.') 222 | 223 | # Define parameters & train 224 | start_training = time.time() 225 | params = {"application": "binary"} 226 | lgbm_model_adv_trained_pure = lgb.train(params, lgbm_dataset) 227 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 228 | 229 | lgbm_model_adv_trained_pure.save_model(MODEL_PATH + 'ember_model_adv_trained_pure.txt') 230 | joblib.dump(lgbm_model_adv_trained_pure, MODEL_PATH + 'ember_model_adv_trained_pure.pkl') 231 | print('Adversarially trained (pure) model saved.') 232 | 233 | # Define dataset for adversarially trained model 'mixed' (AEs + malware and benign) 234 | 235 | print('c) Adversarially train (Mixed) LGBM with {} adversarial, {} malicious, and {} benign files'.format( 236 | int(self.number_examples / 2), int(self.number_examples / 2), self.number_examples)) 237 | 238 | # Load the model if it already exists, otherwise train it 239 | if os.path.exists(MODEL_PATH + 'ember_model_adv_trained_mixed.pkl'): 240 | lgbm_model_adv_trained_mixed = joblib.load(MODEL_PATH + 'ember_model_adv_trained_mixed.pkl') 241 | 242 | else: 243 | number_examples_mal_adv = int(self.number_examples / 2) 244 | X_train = np.concatenate( 245 | (malware_batch[:number_examples_mal_adv], adversarial_batch[:number_examples_mal_adv], benign_batch), 246 | axis=0) 247 | y_train = np.concatenate((np.ones(self.number_examples), np.zeros(self.number_examples)), axis=0) 248 | print('Train data shape for mixed adversarial model:', X_train.shape) 249 | 250 | # Create dataset for training 251 | lgbm_dataset = lgb.Dataset(X_train, y_train) 252 | print('Finished preparing dataset for training.') 253 | 254 | # Define params & train | with feature_version = 1 (2351 features) 255 | start_training = time.time() 256 | params = {"application": "binary"} 257 | lgbm_model_adv_trained_mixed = lgb.train(params, lgbm_dataset) 258 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 259 | 260 | lgbm_model_adv_trained_mixed.save_model(MODEL_PATH + 'ember_model_adv_trained_mixed.txt') 261 | joblib.dump(lgbm_model_adv_trained_mixed, MODEL_PATH + 'ember_model_adv_trained_mixed.pkl') 262 | print('Adversarially trained (mixed) model saved.') 263 | 264 | return lgbm_model_baseline, lgbm_model_adv_trained_pure, lgbm_model_adv_trained_mixed 265 | 266 | def train_logit(self, model_path): 267 | """ 268 | Train a logistic regression model. 269 | 270 | Input: 271 | model_path: path to save & load trained logit model 272 | """ 273 | time_all = time.time() 274 | 275 | # Load EMBER data 276 | print('\nLoading datasets to train LR model: ') 277 | feature_reader = PEFeatureReader() 278 | X_train, y_train, X_test, y_test = feature_reader.read_vectorized_features(VECTORIZED_PATH, feature_version=1) 279 | if self.number_examples == 50000: 280 | start_examples = 38800 281 | end_examples = 189000 282 | X_train = X_train[start_examples:end_examples] 283 | y_train = y_train[start_examples:end_examples] 284 | print('Original features shape:', X_train.shape) 285 | 286 | # Select fewer samples to avoid crashing when working in a notebook 287 | # minvalue = 0 288 | # maxvalue = 900000 # For 100,000 samples: AUC 0.94.
289 | # X_train = X_train[minvalue:maxvalue] 290 | # y_train = y_train[minvalue:maxvalue] 291 | # print('Current data shape:', X_train.shape) 292 | 293 | # Filter out unlabeled 294 | train_rows = (y_train != -1) 295 | X_train = X_train[train_rows] 296 | y_train = y_train[train_rows] 297 | print('Filtered features shape:', X_train.shape) 298 | 299 | # If the training data was reduced, adjust the test data 300 | if self.number_examples == 50000: 301 | test_examples = 30000 302 | X_test = X_test[:test_examples] 303 | y_test = y_test[:test_examples] 304 | print('Test features shape:', X_test.shape) 305 | 306 | # Scale data 307 | norm_std_scaler = StandardScaler().fit(X_train) 308 | X_train = norm_std_scaler.transform(X_train) 309 | X_test = norm_std_scaler.transform(X_test) 310 | 311 | # Load the pre-trained logit model 312 | if os.path.exists(model_path + 'logit_ember.pkl'): 313 | clf_LR = joblib.load(model_path + 'logit_ember.pkl') 314 | else: 315 | # Train the model on the dataset 316 | print('Model not found, LR will be trained...') 317 | clf_LR = LogisticRegression(random_state=24) 318 | clf_LR = clf_LR.fit(X_train, y_train) 319 | joblib.dump(clf_LR, model_path + 'logit_ember.pkl') 320 | 321 | # Show processing time in h:m:s 322 | m, s = divmod(time.time() - time_all, 60) 323 | h, m = divmod(m, 60) 324 | print("Time elapsed training logit: %d:%02d:%02d" % (h, m, s)) 325 | 326 | # Calculate predictions with LR model 327 | print("Model {}".format(clf_LR.__class__.__name__)) 328 | y_pred = clf_LR.predict(X_test) 329 | print("ROC-AUC LR:", roc_auc_score(y_test, y_pred)) 330 | 331 | return clf_LR 332 | 333 | @staticmethod 334 | def extract_important_features(model, features_path): 335 | """ 336 | Extract the most important features from the logit model. 337 | """ 338 | # Get importance weights for LR model 339 | importance = model.coef_ 340 | importance = importance[0] 341 | 342 | # Collect indexes of features that share the same importance weight (excluding features with weight 0) 343 | repeated_indexes = [] 344 | repeated_values = [] 345 | for i, v in enumerate(importance): 346 | curr_repeated_indexes = [idx for idx in range(len(importance)) if importance[idx] == importance[i]] 347 | if len(curr_repeated_indexes) > 1 and v != 0: 348 | repeated_indexes.append(curr_repeated_indexes) 349 | repeated_values.append(v) 350 | 351 | if repeated_indexes: # Only 46 if 0.0 is included as feature value (same weight) 352 | print(len(repeated_indexes), repeated_indexes) 353 | print(len(repeated_values), repeated_values) 354 | 355 | # Get the top-j important features & their indexes (list.index() returns the first match for duplicated weights) 356 | j = 474 # arbitrarily chosen ~20% of 2351 357 | top_j_features = sorted(importance, reverse=True)[:j] 358 | indices = [list(importance).index(value) for value in top_j_features] 359 | print('\nIdentified top 20% features based on feature importances of LR.') 360 | # print('Top {} values: {}'.format(j, top_j_features)) 361 | # print('Top {} indexes: {}'.format(j, indices)) 362 | print() 363 | 364 | np.savez(features_path + 'top_features_LR_importances_indices', indices) 365 | 366 | return indices 367 | 368 | def train_lgbm_important_features(self, features_path): 369 | """ 370 | Train the LightGBM model with the EMBER dataset using only the top 371 | features ranked by the logit model's importance weights.
372 | 373 | Input: 374 | features_path: path to save features from examples 375 | """ 376 | 377 | # Load EMBER data 378 | print('Loading datasets to train LGBM with feature reduction: ') 379 | feature_reader = PEFeatureReader() 380 | X_train, y_train, X_test, _ = feature_reader.read_vectorized_features(VECTORIZED_PATH, feature_version=1) 381 | if self.number_examples == 50000: 382 | start_examples = 38800 383 | end_examples = 189000 384 | X_train = X_train[start_examples:end_examples] 385 | y_train = y_train[start_examples:end_examples] 386 | print('Original features shape:', X_train.shape) 387 | 388 | # Filter unlabeled data 389 | train_rows = (y_train != -1) 390 | X_train = X_train[train_rows] 391 | y_train = y_train[train_rows] 392 | print('Filtered features shape:', X_train.shape) 393 | 394 | # If the training data was reduced, adjust the test data 395 | if self.number_examples == 50000: 396 | test_examples = 30000 397 | X_test = X_test[:test_examples] 398 | print('Test features shape:', X_test.shape) 399 | 400 | # Use only the 20% highest-importance features from the logit model 401 | top_features_LR_importances = np.load(features_path + 'top_features_LR_importances_indices.npz') 402 | top_features_LR_importances = top_features_LR_importances['arr_0'] 403 | X_train = X_train[:, top_features_LR_importances] 404 | print('Top 20% features shape:', X_train.shape) 405 | 406 | # Create dataset for training 407 | lgbm_dataset = lgb.Dataset(X_train, y_train) 408 | # print('Finished preparing dataset for training.\n') 409 | 410 | # Define parameters & train 411 | start_training = time.time() 412 | params = {"application": "binary"} 413 | lgbm_model_reduced = lgb.train(params, lgbm_dataset) 414 | print('Training time: {} mins'.format(round((time.time() - start_training) / 60, 2))) 415 | 416 | lgbm_model_reduced.save_model(MODEL_PATH + 'ember_model_reduced.txt') 417 | joblib.dump(lgbm_model_reduced, MODEL_PATH + 'ember_model_reduced.pkl') 418 | print('Feature-reduced model saved.\n') 419 | 420 | return lgbm_model_reduced 421 | 422 | def train_feature_reduction(self, model_path, features_path): 423 | """ 424 | i) Train a Logistic Regression Model 425 | ii) Extract feature_importances from Logit 426 | iii) Retrain LGBM with 20% most important features 427 | """ 428 | # Train Logit 429 | clf_LR = self.train_logit(model_path=model_path) 430 | 431 | # Extract feature_importances from Logit 432 | self.extract_important_features(model=clf_LR, features_path=features_path) 433 | 434 | # Retrain LGBM with 20% most important features 435 | self.train_lgbm_important_features(features_path=features_path) 436 | -------------------------------------------------------------------------------- /data/pefeatures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' Extracts some basic features from PE files. Many of the features 3 | implemented have been used in previously published works.
For more information, 4 | check out the following resources: 5 | * Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf 6 | * Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf 7 | * Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf 8 | * Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf 9 | * Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf 10 | 11 | It may be useful to do feature selection to reduce this set of features to a meaningful set 12 | for your modeling problem. 13 | 14 | Source: https://github.com/endgameinc/ember 15 | ''' 16 | import os 17 | import re 18 | import lief 19 | import hashlib 20 | import numpy as np 21 | from sklearn.feature_extraction import FeatureHasher 22 | 23 | LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.') 24 | LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 ) 25 | 26 | 27 | class FeatureType(object): 28 | ''' Base class from which each feature type may inherit ''' 29 | 30 | name = '' 31 | dim = 0 32 | 33 | def __repr__(self): 34 | return '{}({})'.format(self.name, self.dim) 35 | 36 | def raw_features(self, bytez, lief_binary): 37 | ''' Generate a JSON-able representation of the file ''' 38 | raise (NotImplementedError) 39 | 40 | def process_raw_features(self, raw_obj): 41 | ''' Generate a feature vector from the raw features ''' 42 | raise (NotImplementedError) 43 | 44 | def feature_vector(self, bytez, lief_binary): 45 | ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently 46 | if there are significant speedups to be gained from combining the two functions. ''' 47 | return self.process_raw_features(self.raw_features(bytez, lief_binary)) 48 | 49 | 50 | class ByteHistogram(FeatureType): 51 | ''' Byte histogram (count + non-normalized) over the entire binary file ''' 52 | 53 | name = 'histogram' 54 | dim = 256 55 | 56 | def __init__(self): 57 | super(FeatureType, self).__init__() 58 | 59 | def raw_features(self, bytez, lief_binary): 60 | counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256) 61 | return counts.tolist() 62 | 63 | def process_raw_features(self, raw_obj): 64 | counts = np.array(raw_obj, dtype=np.float32) 65 | sum = counts.sum() 66 | normalized = counts / sum 67 | return normalized 68 | 69 | 70 | class ByteEntropyHistogram(FeatureType): 71 | ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015). 72 | This roughly approximates the joint probability of byte value and local entropy. 73 | See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info. 74 | ''' 75 | 76 | name = 'byteentropy' 77 | dim = 256 78 | 79 | def __init__(self, step=1024, window=2048): 80 | super(FeatureType, self).__init__() 81 | self.window = window 82 | self.step = step 83 | 84 | def _entropy_bin_counts(self, block): 85 | # coarse histogram, 16 bytes per bin 86 | c = np.bincount(block >> 4, minlength=16) # 16-bin histogram 87 | p = c.astype(np.float32) / self.window 88 | wh = np.where(c)[0] 89 | H = np.sum(-p[wh] * np.log2( 90 | p[wh])) * 2 # * x2 b.c. 
we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits) 91 | 92 | Hbin = int(H * 2) # up to 16 bins (max entropy is 8 bits) 93 | if Hbin == 16: # handle entropy = 8.0 bits 94 | Hbin = 15 95 | 96 | return Hbin, c 97 | 98 | def raw_features(self, bytez, lief_binary): 99 | output = np.zeros((16, 16), dtype=int) 100 | a = np.frombuffer(bytez, dtype=np.uint8) 101 | if a.shape[0] < self.window: 102 | Hbin, c = self._entropy_bin_counts(a) 103 | output[Hbin, :] += c 104 | else: 105 | # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html 106 | shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window) 107 | strides = a.strides + (a.strides[-1],) 108 | blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :] 109 | 110 | # from the blocks, compute histogram 111 | for block in blocks: 112 | Hbin, c = self._entropy_bin_counts(block) 113 | output[Hbin, :] += c 114 | 115 | return output.flatten().tolist() 116 | 117 | def process_raw_features(self, raw_obj): 118 | counts = np.array(raw_obj, dtype=np.float32) 119 | sum = counts.sum() 120 | normalized = counts / sum 121 | return normalized 122 | 123 | 124 | class SectionInfo(FeatureType): 125 | ''' Information about section names, sizes and entropy. Uses hashing trick 126 | to summarize all this section info into a feature vector. 127 | ''' 128 | 129 | name = 'section' 130 | dim = 5 + 50 + 50 + 50 + 50 + 50 131 | 132 | def __init__(self): 133 | super(FeatureType, self).__init__() 134 | 135 | @staticmethod 136 | def _properties(s): 137 | return [str(c).split('.')[-1] for c in s.characteristics_lists] 138 | 139 | def raw_features(self, bytez, lief_binary): 140 | if lief_binary is None: 141 | return {"entry": "", "sections": []} 142 | 143 | # properties of entry point, or if invalid, the first executable section 144 | try: 145 | entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name 146 | except lief.not_found: 147 | # bad entry point, let's find the first executable section 148 | entry_section = "" 149 | for s in lief_binary.sections: 150 | if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: 151 | entry_section = s.name 152 | break 153 | 154 | raw_obj = {"entry": entry_section} 155 | raw_obj["sections"] = [{ 156 | 'name': s.name, 157 | 'size': s.size, 158 | 'entropy': s.entropy, 159 | 'vsize': s.virtual_size, 160 | 'props': self._properties(s) 161 | } for s in lief_binary.sections] 162 | return raw_obj 163 | 164 | def process_raw_features(self, raw_obj): 165 | sections = raw_obj['sections'] 166 | general = [ 167 | len(sections), # total number of sections 168 | # number of sections with zero size 169 | sum(1 for s in sections if s['size'] == 0), 170 | # number of sections with an empty name 171 | sum(1 for s in sections if s['name'] == ""), 172 | # number of RX (read + execute) sections 173 | sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']), 174 | # number of W (writable) sections 175 | sum(1 for s in sections if 'MEM_WRITE' in s['props']) 176 | ] 177 | # gross characteristics of each section 178 | section_sizes = [(s['name'], s['size']) for s in sections] 179 | section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0] 180 | section_entropy = [(s['name'], s['entropy']) for s in sections] 181 | section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] 182 | section_vsize = [(s['name'], s['vsize']) for s in sections] 183 | section_vsize_hashed
= FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] 184 | entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0] 185 | characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] 186 | characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] 187 | 188 | return np.hstack([ 189 | general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed, 190 | characteristics_hashed 191 | ]).astype(np.float32) 192 | 193 | 194 | class ImportsInfo(FeatureType): 195 | ''' Information about imported libraries and functions from the 196 | import address table. Note that the total number of imported 197 | functions is contained in GeneralFileInfo. 198 | ''' 199 | 200 | name = 'imports' 201 | dim = 1280 202 | 203 | def __init__(self): 204 | super(FeatureType, self).__init__() 205 | 206 | def raw_features(self, bytez, lief_binary): 207 | imports = {} 208 | if lief_binary is None: 209 | return imports 210 | 211 | for lib in lief_binary.imports: 212 | if lib.name not in imports: 213 | imports[lib.name] = [] # libraries can be duplicated in listing, extend instead of overwrite 214 | 215 | # Clipping assumes there are diminishing returns on the discriminatory power of imported functions 216 | # beyond the first 10000 characters, and this will help limit the dataset size 217 | for entry in lib.entries: 218 | if entry.is_ordinal: 219 | imports[lib.name].append("ordinal" + str(entry.ordinal)) 220 | else: 221 | imports[lib.name].append(entry.name[:10000]) 222 | 223 | return imports 224 | 225 | def process_raw_features(self, raw_obj): 226 | # unique libraries 227 | libraries = list(set([l.lower() for l in raw_obj.keys()])) 228 | libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0] 229 | 230 | # A string like "kernel32.dll:CreateFileMappingA" for each imported function 231 | imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist] 232 | imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0] 233 | 234 | # Two separate elements: libraries (alone) and fully-qualified names of imported functions 235 | return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32) 236 | 237 | 238 | class ExportsInfo(FeatureType): 239 | ''' Information about exported functions. Note that the total number of exported 240 | functions is contained in GeneralFileInfo. 
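Exported function names are summarized with the hashing trick into a 128-bin vector (see process_raw_features below).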
241 | ''' 242 | 243 | name = 'exports' 244 | dim = 128 245 | 246 | def __init__(self): 247 | super(FeatureType, self).__init__() 248 | 249 | def raw_features(self, bytez, lief_binary): 250 | if lief_binary is None: 251 | return [] 252 | 253 | # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond 254 | # the first 10000 characters, and this will help limit the dataset size 255 | if LIEF_EXPORT_OBJECT: 256 | # export is an object with .name attribute (0.10.0 and later) 257 | clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions] 258 | else: 259 | # export is a string (LIEF 0.9.0 and earlier) 260 | clipped_exports = [export[:10000] for export in lief_binary.exported_functions] 261 | 262 | 263 | return clipped_exports 264 | 265 | def process_raw_features(self, raw_obj): 266 | exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0] 267 | return exports_hashed.astype(np.float32) 268 | 269 | 270 | class GeneralFileInfo(FeatureType): 271 | ''' General information about the file ''' 272 | 273 | name = 'general' 274 | dim = 10 275 | 276 | def __init__(self): 277 | super(FeatureType, self).__init__() 278 | 279 | def raw_features(self, bytez, lief_binary): 280 | if lief_binary is None: 281 | return { 282 | 'size': len(bytez), 283 | 'vsize': 0, 284 | 'has_debug': 0, 285 | 'exports': 0, 286 | 'imports': 0, 287 | 'has_relocations': 0, 288 | 'has_resources': 0, 289 | 'has_signature': 0, 290 | 'has_tls': 0, 291 | 'symbols': 0 292 | } 293 | 294 | return { 295 | 'size': len(bytez), 296 | 'vsize': lief_binary.virtual_size, 297 | 'has_debug': int(lief_binary.has_debug), 298 | 'exports': len(lief_binary.exported_functions), 299 | 'imports': len(lief_binary.imported_functions), 300 | 'has_relocations': int(lief_binary.has_relocations), 301 | 'has_resources': int(lief_binary.has_resources), 302 | 'has_signature': int(lief_binary.has_signature), 303 | 'has_tls': int(lief_binary.has_tls), 304 | 'symbols': len(lief_binary.symbols), 305 | } 306 | 307 | def process_raw_features(self, raw_obj): 308 | return np.asarray([ 309 | raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'], 310 | raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'], 311 | raw_obj['symbols'] 312 | ], 313 | dtype=np.float32) 314 | 315 | 316 | class HeaderFileInfo(FeatureType): 317 | ''' Machine, architecture, OS, linker and other information extracted from the header ''' 318 | 319 | name = 'header' 320 | dim = 62 321 | 322 | def __init__(self): 323 | super(FeatureType, self).__init__() 324 | 325 | def raw_features(self, bytez, lief_binary): 326 | raw_obj = {} 327 | raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []} 328 | raw_obj['optional'] = { 329 | 'subsystem': "", 330 | 'dll_characteristics': [], 331 | 'magic': "", 332 | 'major_image_version': 0, 333 | 'minor_image_version': 0, 334 | 'major_linker_version': 0, 335 | 'minor_linker_version': 0, 336 | 'major_operating_system_version': 0, 337 | 'minor_operating_system_version': 0, 338 | 'major_subsystem_version': 0, 339 | 'minor_subsystem_version': 0, 340 | 'sizeof_code': 0, 341 | 'sizeof_headers': 0, 342 | 'sizeof_heap_commit': 0 343 | } 344 | if lief_binary is None: 345 | return raw_obj 346 | 347 | raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps 348 | raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1] 349 | raw_obj['coff']['characteristics'] =
[str(c).split('.')[-1] for c in lief_binary.header.characteristics_list] 350 | raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1] 351 | raw_obj['optional']['dll_characteristics'] = [ 352 | str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists 353 | ] 354 | raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1] 355 | raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version 356 | raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version 357 | raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version 358 | raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version 359 | raw_obj['optional'][ 360 | 'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version 361 | raw_obj['optional'][ 362 | 'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version 363 | raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version 364 | raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version 365 | raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code 366 | raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers 367 | raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit 368 | return raw_obj 369 | 370 | def process_raw_features(self, raw_obj): 371 | return np.hstack([ 372 | raw_obj['coff']['timestamp'], 373 | FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0], 374 | FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0], 375 | FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0], 376 | FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0], 377 | FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0], 378 | raw_obj['optional']['major_image_version'], 379 | raw_obj['optional']['minor_image_version'], 380 | raw_obj['optional']['major_linker_version'], 381 | raw_obj['optional']['minor_linker_version'], 382 | raw_obj['optional']['major_operating_system_version'], 383 | raw_obj['optional']['minor_operating_system_version'], 384 | raw_obj['optional']['major_subsystem_version'], 385 | raw_obj['optional']['minor_subsystem_version'], 386 | raw_obj['optional']['sizeof_code'], 387 | raw_obj['optional']['sizeof_headers'], 388 | raw_obj['optional']['sizeof_heap_commit'], 389 | ]).astype(np.float32) 390 | 391 | 392 | class StringExtractor(FeatureType): 393 | ''' Extracts strings from raw byte stream ''' 394 | 395 | name = 'strings' 396 | dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1 397 | 398 | def __init__(self): 399 | super(FeatureType, self).__init__() 400 | # all consecutive runs of 0x20 - 0x7f that are 5+ characters 401 | self._allstrings = re.compile(b'[\x20-\x7f]{5,}') 402 | # occurrences of the string 'C:\'. Not actually extracting the path 403 | self._paths = re.compile(b'c:\\\\', re.IGNORECASE) 404 | # occurrences of http:// or https://. Not actually extracting the URLs 405 | self._urls = re.compile(b'https?://', re.IGNORECASE) 406 | # occurrences of the string prefix HKEY_.
Not actually extracting registry names 407 | self._registry = re.compile(b'HKEY_') 408 | # crude evidence of an MZ header (dropper?) somewhere in the byte stream 409 | self._mz = re.compile(b'MZ') 410 | 411 | def raw_features(self, bytez, lief_binary): 412 | allstrings = self._allstrings.findall(bytez) 413 | if allstrings: 414 | # statistics about strings: 415 | string_lengths = [len(s) for s in allstrings] 416 | avlength = sum(string_lengths) / len(string_lengths) 417 | # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive 418 | as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)] 419 | c = np.bincount(as_shifted_string, minlength=96) # histogram count 420 | # distribution of characters in printable strings 421 | csum = c.sum() 422 | p = c.astype(np.float32) / csum 423 | wh = np.where(c)[0] 424 | H = np.sum(-p[wh] * np.log2(p[wh])) # entropy 425 | else: 426 | avlength = 0 427 | c = np.zeros((96,), dtype=np.float32) 428 | H = 0 429 | csum = 0 430 | 431 | return { 432 | 'numstrings': len(allstrings), 433 | 'avlength': avlength, 434 | 'printabledist': c.tolist(), # store non-normalized histogram 435 | 'printables': int(csum), 436 | 'entropy': float(H), 437 | 'paths': len(self._paths.findall(bytez)), 438 | 'urls': len(self._urls.findall(bytez)), 439 | 'registry': len(self._registry.findall(bytez)), 440 | 'MZ': len(self._mz.findall(bytez)) 441 | } 442 | 443 | def process_raw_features(self, raw_obj): 444 | hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0 445 | return np.hstack([ 446 | raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'], 447 | np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'], 448 | raw_obj['registry'], raw_obj['MZ'] 449 | ]).astype(np.float32) 450 | 451 | 452 | class DataDirectories(FeatureType): 453 | ''' Extracts size and virtual address of the first 15 data directories ''' 454 | 455 | name = 'datadirectories' 456 | dim = 15 * 2 457 | 458 | def __init__(self): 459 | super(FeatureType, self).__init__() 460 | self._name_order = [ 461 | "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE", 462 | "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE", 463 | "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER" 464 | ] 465 | 466 | def raw_features(self, bytez, lief_binary): 467 | output = [] 468 | if lief_binary is None: 469 | return output 470 | 471 | for data_directory in lief_binary.data_directories: 472 | output.append({ 473 | "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""), 474 | "size": data_directory.size, 475 | "virtual_address": data_directory.rva 476 | }) 477 | return output 478 | 479 | def process_raw_features(self, raw_obj): 480 | features = np.zeros(2 * len(self._name_order), dtype=np.float32) 481 | for i in range(len(self._name_order)): 482 | if i < len(raw_obj): 483 | features[2 * i] = raw_obj[i]["size"] 484 | features[2 * i + 1] = raw_obj[i]["virtual_address"] 485 | return features 486 | 487 | 488 | class PEFeatureExtractor(object): 489 | ''' Extract useful features from a PE file, and return as a vector of fixed size.
''' 490 | 491 | def __init__(self, feature_version=1): 492 | self.features = [ 493 | ByteHistogram(), 494 | ByteEntropyHistogram(), 495 | StringExtractor(), 496 | GeneralFileInfo(), 497 | HeaderFileInfo(), 498 | SectionInfo(), 499 | ImportsInfo(), 500 | ExportsInfo() 501 | ] 502 | if feature_version == 1: 503 | if not lief.__version__.startswith("0.8.3"): 504 | pass 505 | #print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75") 506 | #print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") 507 | #print(f"WARNING: in the feature calculations.") 508 | elif feature_version == 2: 509 | self.features.append(DataDirectories()) 510 | if not lief.__version__.startswith("0.9.0"): 511 | pass 512 | #print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-") 513 | #print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") 514 | #print(f"WARNING: in the feature calculations.") 515 | else: 516 | raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}") 517 | self.dim = sum([fe.dim for fe in self.features]) 518 | 519 | def raw_features(self, bytez): 520 | lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, 521 | RuntimeError) 522 | try: 523 | lief_binary = lief.PE.parse(list(bytez)) 524 | except lief_errors as e: 525 | print("lief error: ", str(e)) 526 | lief_binary = None 527 | except Exception: # everything else (KeyboardInterrupt, SystemExit, ValueError): 528 | raise 529 | 530 | features = {"sha256": hashlib.sha256(bytez).hexdigest()} 531 | features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features}) 532 | return features 533 | 534 | def process_raw_features(self, raw_obj): 535 | feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features] 536 | return np.hstack(feature_vectors).astype(np.float32) 537 | 538 | def feature_vector(self, bytez): 539 | return self.process_raw_features(self.raw_features(bytez)) 540 | 541 | 542 | class PEFeatureReader(object): 543 | ''' Load features from vectorized .dat files. 
''' 544 | 545 | def read_vectorized_features(self, data_dir, subset=None, feature_version=1): 546 | """ 547 | Read vectorized features into memory mapped numpy arrays 548 | """ 549 | if subset is not None and subset not in ["train", "test"]: 550 | return None 551 | 552 | extractor = PEFeatureExtractor(feature_version) 553 | ndim = extractor.dim 554 | X_train = None 555 | y_train = None 556 | X_test = None 557 | y_test = None 558 | 559 | if subset is None or subset == "train": 560 | X_train_path = os.path.join(data_dir, "X_train.dat") 561 | y_train_path = os.path.join(data_dir, "y_train.dat") 562 | y_train = np.memmap(y_train_path, dtype=np.float32, mode="r") 563 | N = y_train.shape[0] 564 | X_train = np.memmap(X_train_path, dtype=np.float32, mode="r", shape=(N, ndim)) 565 | if subset == "train": 566 | return X_train, y_train 567 | 568 | if subset is None or subset == "test": 569 | X_test_path = os.path.join(data_dir, "X_test.dat") 570 | y_test_path = os.path.join(data_dir, "y_test.dat") 571 | y_test = np.memmap(y_test_path, dtype=np.float32, mode="r") 572 | N = y_test.shape[0] 573 | X_test = np.memmap(X_test_path, dtype=np.float32, mode="r", shape=(N, ndim)) 574 | if subset == "test": 575 | return X_test, y_test 576 | 577 | return X_train, y_train, X_test, y_test 578 | -------------------------------------------------------------------------------- /src/functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import lief 5 | import json 6 | import time 7 | import random 8 | import shutil 9 | import joblib 10 | import zipfile 11 | import lightgbm 12 | import requests 13 | import subprocess 14 | import numpy as np 15 | import pandas as pd 16 | import src.config as cfg 17 | from hashlib import sha256 18 | import data.manipulate as m 19 | from datetime import timedelta 20 | from data.pefeatures import PEFeatureExtractor 21 | 22 | VT_API_KEY = cfg.file['apiKeys']['vt'] 23 | HA_API_KEY = cfg.file['apiKeys']['ha'] 24 | MD_API_KEY = cfg.file['apiKeys']['md'] 25 | 26 | EXCEPTIONS = (MemoryError, lief.bad_file, lief.bad_format, lief.not_found) 27 | 28 | 29 | def time_me(start_time): 30 | """ 31 | Timer returning output in following format HH:MM:SS 32 | """ 33 | # Show total time in hh:mm:ss 34 | minutes, seconds = divmod(time.time() - start_time, 60) 35 | hours, minutes = divmod(minutes, 60) 36 | print('\nProcessing time: %02d:%02d:%02d\n' % (hours, minutes, seconds)) 37 | return '%02d:%02d:%02d' % (hours, minutes, seconds) 38 | 39 | 40 | def readfile(filename): 41 | """ 42 | Convert file into bytes 43 | """ 44 | 45 | with open(filename, "rb") as b: 46 | b_bytes = b.read() 47 | return b_bytes 48 | 49 | 50 | def unzip_file(zipped_path, unzipped_path): 51 | """ 52 | Unzip downloaded malware with standard industry password 53 | """ 54 | 55 | for item in os.listdir(zipped_path): 56 | if item.endswith(".zip"): 57 | full_path = zipped_path + item 58 | zip_file = zipfile.ZipFile(full_path, 'r') 59 | zip_file.setpassword(b"infected") # Industry password for malware 60 | zip_file.extractall(unzipped_path) 61 | zip_file.close() 62 | 63 | 64 | def hash_files(filename): 65 | """ 66 | Return SHA256 of a file 67 | """ 68 | 69 | h = sha256() 70 | with open(filename, 'rb', buffering=0) as f: 71 | for b in iter(lambda: f.read(128 * 1024), b''): 72 | h.update(b) 73 | return h.hexdigest() 74 | 75 | 76 | def rename_files(files_path): 77 | """ 78 | Rename files with SHA256 value 79 | """ 80 | 81 | for item in os.listdir(files_path): 82 
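# Note (annotation): paths are built by plain concatenation, so files_path is
# expected to end with '/'; os.path.join(files_path, item) would be the more
# robust choice. Renaming to the SHA256 digest is also idempotent: a second
# run maps every file to the name it already has.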
| files = files_path + item 83 | sha = hash_files(files) 84 | os.rename(files, files_path + sha) 85 | 86 | 87 | def url_ok(url): 88 | """ 89 | Check URL status 90 | """ 91 | 92 | r = requests.get(url, timeout=10) 93 | return r.status_code 94 | 95 | 96 | def create_sequential_actions(size_of_actions, n): 97 | """ 98 | Return vector filled with sequential perturbations 99 | e.g: 100 | for n = 4 and size_of_actions = 10 101 | 102 | [0, 0, 0, 0] 103 | [0, 0, 0, 1] 104 | [0, 0, 0, 2] 105 | ... 106 | [9, 9, 9, 9] 107 | """ 108 | 109 | sequential_actions = [] 110 | string_format_n = "{0:0" + str(n) + "}" 111 | cases_generated = [string_format_n.format(i) for i in range(size_of_actions ** n)] 112 | 113 | for i in range(len(cases_generated)): 114 | sequential_actions.append([int(s) for s in cases_generated[i]]) 115 | 116 | return sequential_actions 117 | 118 | 119 | def create_random_actions(size_of_actions, n): 120 | """ 121 | Return vector filled with random perturbations 122 | """ 123 | 124 | random.seed() 125 | random_actions = random.sample(range(size_of_actions), n) 126 | return random_actions 127 | 128 | 129 | def actions_vector(actions_dict): 130 | """ 131 | Creating a dict with all available perturbations 132 | """ 133 | 134 | actions = {i: act for i, act in enumerate(actions_dict)} 135 | return actions 136 | 137 | 138 | def build_bytes(input_bytes, total_number_perturbations): 139 | """ 140 | Compile a malware mutation after perturbations are injected 141 | 142 | Input: 143 | input_bytes: input malware in bytes 144 | total_number_perturbations: number of perturbations injected to keep track in name 145 | """ 146 | 147 | try: 148 | new_binary = lief.PE.parse(list(input_bytes)) 149 | builder = lief.PE.Builder(new_binary) 150 | builder.build_imports(True) 151 | builder.patch_imports(True) 152 | builder.build() 153 | name_mod_file = cfg.file['paths']['mod'] + str(total_number_perturbations) + '_m.exe' 154 | builder.write(name_mod_file) 155 | 156 | except EXCEPTIONS as e: 157 | print("When parsing & building returned the following error:", str(e)) 158 | return None 159 | 160 | return name_mod_file 161 | 162 | 163 | def rec_mod_files(input_bytes, actions, chosen_actions, inject_perturbation): 164 | """ 165 | Recursive function to inject perturbations to input malware sample 166 | 167 | Input: 168 | input_bytes: input malware in bytes 169 | actions: all possible perturbations 170 | chosen_actions: vector of perturbations to inject 171 | inject_perturbation: perturbation being injected on this iteration 172 | """ 173 | 174 | if inject_perturbation == -1: 175 | return build_bytes(input_bytes, len(chosen_actions)) 176 | else: 177 | try: 178 | manipulator = m.MalwareManipulator(input_bytes) 179 | next_action = actions[chosen_actions[inject_perturbation]] 180 | inject_action = manipulator.__getattribute__(next_action) 181 | mod_bytes = inject_action(input_bytes) 182 | 183 | except EXCEPTIONS as e: 184 | print('When injecting perturbation returned the error: ', e) 185 | return None 186 | 187 | return rec_mod_files(mod_bytes, actions, chosen_actions, inject_perturbation - 1) 188 | 189 | 190 | # CALCULATE DIFFERENCE BETWEEN TWO PEs 191 | 192 | 193 | def get_difference(sample1, sample2): 194 | """ 195 | Calculate the difference between two PE: 196 | 197 | Input: 198 | sample1: original sample S 199 | sample2: mutation S' 200 | """ 201 | 202 | s1_bytes = readfile(sample1) 203 | s2_bytes = readfile(sample2) 204 | try: 205 | # Use -n to compare only until smallest file ends to avoid EOF message 206 | 
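# Note (annotation): 'cmp -l' emits one line per differing byte (offset plus
# the two octal values), so len(stdout) below is a rough proxy for the
# byte-level distance between the original sample and its mutation.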
compare_samples = subprocess.Popen( 207 | ['cmp', '-l', '-n' + str(min(len(s1_bytes), len(s2_bytes))), sample1, sample2], 208 | stdout=subprocess.PIPE) 209 | out_compare_samples, err_compare_samples = compare_samples.communicate() 210 | 211 | except subprocess.CalledProcessError as e: 212 | raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) 213 | 214 | compare_samples.kill() 215 | return len(out_compare_samples) 216 | 217 | 218 | # API MANAGEMENT: VIRUS TOTAL & HYBRID ANALYSIS & METADEFENDER 219 | 220 | 221 | def check_API_key(api_key): 222 | """ 223 | Check whether an API key is given before using an external service 224 | """ 225 | if api_key == '': 226 | sys.exit('\nProvide an API key to use this service.\n') 227 | return 1 228 | 229 | 230 | def get_user_quotas_VT(): 231 | """ 232 | APIv3 implementation to get request quotas of user 233 | """ 234 | 235 | url = 'https://www.virustotal.com/api/v3/users' 236 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 237 | response = requests.get(url + '/{}'.format(VT_API_KEY), headers=headers) 238 | json_response = response.json() 239 | request_rate = json_response['data']['attributes']['quotas']['api_requests_hourly']['allowed'] 240 | return request_rate 241 | 242 | 243 | def send_VT(sample): 244 | """ 245 | APIv3 implementation to send a file for analysis using VirusTotal 246 | 247 | Input: 248 | sample: malware that will be labeled 249 | """ 250 | 251 | # Check API key given 252 | check_API_key(VT_API_KEY) 253 | 254 | url = 'https://www.virustotal.com/api/v3/files' 255 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 256 | files = {'file': (sample, open(sample, 'rb'))} 257 | response = requests.post(url, headers=headers, files=files) 258 | json_response = response.json() 259 | return json_response 260 | 261 | 262 | def get_report_VT(file_hash, rescan=False): 263 | """ 264 | APIv3 implementation to retrieve report from a file analyzed using VirusTotal 265 | 266 | Input: 267 | file_hash: sample of malware to retrieve 268 | rescan: boolean option to rescan file in case it is previously detected 269 | """ 270 | 271 | # Check API key given 272 | check_API_key(VT_API_KEY) 273 | 274 | requests_allowed_minute = get_user_quotas_VT() / 60 275 | url = 'https://www.virustotal.com/api/v3/files' 276 | headers = {'x-apikey': VT_API_KEY, 'Accept': 'application/json'} 277 | querystring = {'limit': '10'} 278 | 279 | if rescan: 280 | response = requests.post(url + '/{}/analyse'.format(file_hash), headers=headers, params=querystring) 281 | return response.json() 282 | else: 283 | attempts = 0 284 | while attempts < requests_allowed_minute: 285 | 286 | response = requests.get(url + '/{}'.format(file_hash), headers=headers, params=querystring) 287 | 288 | if response.status_code == 404: 289 | time_to_sleep = (1 if 60 / requests_allowed_minute < 1 else 60 / requests_allowed_minute) 290 | print("Sample is not on VirusTotal. Waiting {} s..".format(time_to_sleep)) 291 | time.sleep(time_to_sleep) 292 | 293 | elif response.status_code != 200: 294 | print( 295 | '\nVirusTotal returned server error {} while requesting scan report. Probably API issues. Exiting ' 296 | 'application until solved.\n'.format( 297 | response.status_code)) 298 | sys.exit() 299 | 300 | else: 301 | json_response = response.json() 302 | return json_response 303 | 304 | attempts += 1 305 | 306 | sys.exit("VirusTotal processing is taking too long. 
Timing out.") 307 | 308 | 309 | def get_report_VT_ext(file_hash, json_dest_path): 310 | """ 311 | Detecting malware samples using VirusTotal APIv3 (remote) 312 | 313 | Input: 314 | sample_report: the number of VT detections to use as benchmark 315 | """ 316 | 317 | print('\nDetection for sample:', file_hash) 318 | 319 | try: 320 | # Get VirusTotal detections - Rescan: False 321 | report = get_report_VT(file_hash, False) 322 | report_stats = report['data']['attributes']['last_analysis_stats'] 323 | report_results = report['data']['attributes']['last_analysis_results'] 324 | 325 | # Check reported status of sample 326 | detected = report_stats['malicious'] 327 | undetected = report_stats['undetected'] 328 | total = detected + undetected 329 | print('\nDetected by {} out of {} engines. \n'.format(detected, total)) 330 | 331 | # Print only engines detecting new sample 332 | engines_detecting = {key: val for key, val in report_results.items() if val['category'] == 'malicious'} 333 | print(list(engines_detecting.keys())) 334 | 335 | # Label as malicious if most of engines do so 336 | detection = (1 if detected / total > 0.5 else 0) 337 | 338 | # Provide link to sample detections report 339 | # print('\n{}'.format(report['data']['links']['self'])) 340 | 341 | # Save json file 342 | with open(json_dest_path.format(file_hash), 'w') as json_file: 343 | json.dump(report, json_file) 344 | 345 | return detection 346 | 347 | except (requests.ConnectionError, requests.Timeout, requests.ConnectTimeout) as e: 348 | print('Connection issues or API requests threshold reached: {}'.format(e)) 349 | 350 | 351 | def send_MD(sample): 352 | """ 353 | APIv2 implementation to send a file for analysis using MetaDefender 354 | 355 | Input: 356 | sample: malware that will be labeled 357 | """ 358 | 359 | # Check API key given 360 | check_API_key(MD_API_KEY) 361 | 362 | headers = {'apikey': MD_API_KEY} 363 | files = {'file': (sample, open(sample, 'rb'))} 364 | response = requests.post('https://api.metadefender.com/v2/file', headers=headers, files=files) 365 | json_response = response.json() 366 | return json_response 367 | 368 | 369 | def get_report_MD(data_id): 370 | """ 371 | APIv2 implementation to retrieve report from a file analyzed using MetaDefender 372 | """ 373 | 374 | # Check API key given 375 | check_API_key(MD_API_KEY) 376 | 377 | headers = {'apikey': MD_API_KEY} 378 | response = requests.get('https://api.metadefender.com/v2/file/' + data_id, headers=headers) 379 | json_response = response.json() 380 | return json_response 381 | 382 | 383 | def send_HA(sample, environment_id): 384 | """ 385 | APIv2 implementation to send a file for analysis using Hybrid Analysis 386 | 387 | Input: 388 | sample: malware that will be labeled 389 | environment_id: OS used to run malware sample (ID = 120: Windows7 - 64 bits) 390 | """ 391 | 392 | # Check API key given 393 | check_API_key(HA_API_KEY) 394 | 395 | f = open(sample, 'rb') 396 | headers = {'User-agent': 'Falcon Sandbox', 'api-key': HA_API_KEY} 397 | data = {'environment_id': environment_id, 'no_share_third_party': True, 'allow_community_access': False} 398 | files = {'file': f} 399 | 400 | try: 401 | submitUrl = 'https://www.hybrid-analysis.com/api/v2/submit/file' 402 | res = requests.post(submitUrl, headers=headers, data=data, files=files) 403 | if res.status_code == 200 or res.status_code == 201: 404 | print("\nFile successfully submitted to analysis: {}".format(os.path.basename(sample))) 405 | f.close() 406 | return res.json() 407 | else: 408 | print("Error code: 
{}, returned when uploading: {}".format(res.status_code, f.name)) 409 | return res.status_code 410 | 411 | except requests.exceptions.HTTPError as err: 412 | print("HTTP error encountered: {}".format(err)) 413 | return None 414 | 415 | 416 | def get_report_HA(file_hash): 417 | """ 418 | APIv2 implementation to retrieve report from a file analyzed using Hybrid Analysis 419 | """ 420 | 421 | # Check API key given 422 | check_API_key(HA_API_KEY) 423 | 424 | # Adjusted the API from params to headers to send the api-key 425 | headers = {'User-agent': 'Falcon Sandbox', 'api-key': HA_API_KEY} 426 | # EnvironmentID = 120 needs to be URL-encoded as '%3A120' (i.e., ':120') 427 | res = requests.get('https://www.hybrid-analysis.com/api/v2/report/' + file_hash + '%3A120' + '/summary', 428 | headers=headers) 429 | json_res = res.json() 430 | return json_res 431 | 432 | 433 | # API MANAGEMENT: LOCAL SANDBOX (CUCKOO) 434 | 435 | 436 | def send_local_sandbox(sample): 437 | """ 438 | API implementation to send a file for analysis using Cuckoo sandbox (local) 439 | 440 | Input: 441 | sample: malware that will be labeled 442 | """ 443 | 444 | submitUrl = "http://localhost:8090/tasks/create/file" 445 | data = {'timeout': '30'} 446 | with open(sample, "rb") as sample: 447 | files = {"file": ("new_mutation", sample)} 448 | r = requests.post(submitUrl, data=data, files=files) 449 | 450 | try: 451 | if r.status_code == 200: 452 | # print("\nFile successfully submitted to analysis: {}".format(os.path.basename(sample))) 453 | sample.close() 454 | return r.json() 455 | else: 456 | print("Error code: {}, returned when submitting.".format(r.status_code)) 457 | return r.status_code 458 | 459 | except requests.exceptions.HTTPError as err: 460 | print("HTTP error encountered: {}".format(err)) 461 | return None 462 | 463 | 464 | def get_report_local_sandbox(id_report, option): 465 | """ 466 | API implementation to retrieve report from a file analyzed using the Cuckoo sandbox 467 | """ 468 | 469 | # Options: view = short report | report = extensive report 470 | if option == 'view': 471 | r = requests.get('http://localhost:8090/tasks/view/' + str(id_report)) 472 | else: 473 | r = requests.get('http://localhost:8090/tasks/report/' + str(id_report)) 474 | return r.json() 475 | 476 | 477 | # DATABASE.CSV CREATION & UPDATE 478 | 479 | 480 | def collect_info_CSV(sample, sample_report, number_perturbations, chosen_actions, mod_sample_hash, hash_sample): 481 | """ 482 | Collect info on dict and prepare to save on CSV 483 | 484 | Input: 485 | sample: name of malware mutation 486 | sample_report: detection rate of mutation (positive/total detections) 487 | number_perturbations: number of perturbations injected 488 | chosen_actions: vector with perturbations injected to create malware mutation 489 | mod_sample_hash: SHA256 value of malware mutation 490 | hash_sample: SHA256 value of original malware provided as input 491 | """ 492 | 493 | CSV = {'Original_File': sample, 'OF_Detections': str(sample_report['positives']) + '/' + str( 494 | sample_report['total']), 'Perturbations': str(number_perturbations), 495 | 'Perturbations_Injected': chosen_actions[:number_perturbations], 'Mod_File_Hash': mod_sample_hash, 496 | 'Original_File_Hash': hash_sample} 497 | return CSV 498 | 499 | 500 | def write_dict_CSV(csv_file, CSV, fields): 501 | """ 502 | Save a dict into a CSV file 503 | 504 | Input: 505 | csv_file: CSV file to create 506 | CSV: dict with values to store 507 | fields: pre-defined column names 508 | """ 509 | 510 | try: 511 | if not os.path.isfile(csv_file): 512 | with open(csv_file,
'w') as fi: 513 | writer = csv.DictWriter(fi, fieldnames=fields) 514 | writer.writeheader() 515 | writer.writerow(CSV) 516 | else: 517 | with open(csv_file, 'a') as fi: 518 | writer = csv.DictWriter(fi, fieldnames=fields, extrasaction='ignore') 519 | writer.writerow(CSV) 520 | 521 | except IOError as err: 522 | print("Exception: {}".format(err)) 523 | 524 | 525 | # TABLE CREATION FOR COMPARISON BETWEEN ARMED & AIMED 526 | 527 | 528 | def time_to_seconds(data, new_df_cols=None, original_csv_cols=None): 529 | """ 530 | Convert time in data.csv from hh:mm:ss to s 531 | 532 | Input: 533 | data: input CSV file 534 | new_df_cols: columns for new dataframe used for format conversion (optional) 535 | original_csv_cols: pre-defined columns in original input CSV (optional) 536 | """ 537 | 538 | if new_df_cols is None: 539 | new_df_cols = ['Perturbations', 'Files M1', 'Time M1', 'Time M2'] 540 | if original_csv_cols is None: 541 | original_csv_cols = ['Sample', 'Perturbations', 'Module 1', 'Time M1', 'Files M1', 'Corr M1', 'Module 2', 542 | 'Time M2', 'Files M2', 'Corr M2', 'Total Time'] 543 | time_seconds = pd.DataFrame(columns=new_df_cols) 544 | csv_panda = pd.read_csv(data, names=original_csv_cols) 545 | for i in range(1, len(csv_panda)): 546 | x = time.strptime(csv_panda['Time M1'][i].split(',')[0], '%H:%M:%S') 547 | y = time.strptime(csv_panda['Time M2'][i].split(',')[0], '%H:%M:%S') 548 | time_seconds.loc[len(time_seconds)] = [csv_panda['Perturbations'][i], csv_panda['Files M1'][i], 549 | timedelta(hours=x.tm_hour, minutes=x.tm_min, 550 | seconds=x.tm_sec).total_seconds(), 551 | timedelta(hours=y.tm_hour, minutes=y.tm_min, 552 | seconds=y.tm_sec).total_seconds()] 553 | 554 | return time_seconds 555 | 556 | 557 | def sum_times(data, col_time): 558 | """ 559 | Calculate from data the total time elapsed processing ARMED & AIMED 560 | 561 | Input: 562 | data: pd.Dataframe with time information 563 | col_time: column with time values (e.g., col_time='Time M1') 564 | """ 565 | 566 | sum_time_elapsed = {} 567 | for i in range(1, len(data)): 568 | if (data['Files M1'][i]) in sum_time_elapsed.keys(): 569 | ext_sum = sum_time_elapsed[(data['Files M1'][i])] + data[col_time][i] 570 | sum_time_elapsed.update({(data['Files M1'][i]): ext_sum}) 571 | else: 572 | sum_time_elapsed[(data['Files M1'][i])] = data[col_time][i] # Seed with col_time, not a fixed column 573 | 574 | return sum_time_elapsed 575 | 576 | 577 | def average_times(number_files_grouped_AXMED, sum_times_files_ARMED, sum_times_files_AIMED, csv_file=None, save=False): 578 | """ 579 | Create dict with number of mutations generated and average processing time 580 | for ARMED (column 1) and AIMED (column 2) 581 | 582 | Input: 583 | number_files_grouped_AXMED: group with sum of all instances of times with same number of files created 584 | sum_times_files_ARMED: sum of all instances of times with same number of files created for ARMED 585 | sum_times_files_AIMED: sum of all instances of times with same number of files created for AIMED 586 | csv_file: input csv file (optional) 587 | save: boolean value to confirm whether to save results (default: False) 588 | """ 589 | 590 | average_times_ARMED = {} 591 | average_times_AIMED = {} 592 | for k, v in sum_times_files_ARMED.items(): 593 | average_times_ARMED.update({k: round(sum_times_files_ARMED[k] / number_files_grouped_AXMED[k])}) 594 | average_times_AIMED.update({k: round(sum_times_files_AIMED[k] / number_files_grouped_AXMED[k])}) 595 | 596 | # Convert all items, keys (strings) and values (pd.Dataframe) to int 597 | average_times_ARMED =
{int(k): int(v) for k, v in average_times_ARMED.items()} 598 | average_times_AIMED = {int(k): int(v) for k, v in average_times_AIMED.items()} 599 | 600 | list_avg_times_ARMED = sorted(average_times_ARMED.items()) 601 | list_avg_times_AIMED = sorted(average_times_AIMED.items()) 602 | 603 | if save: 604 | with open('support_armed_times.csv', 'a') as f: 605 | writer = csv.writer(f) 606 | for row_i in list_avg_times_ARMED: 607 | writer.writerow(row_i) 608 | f.close() 609 | 610 | # Remove existing file to avoid adding duplicated data 611 | if os.path.exists(csv_file): 612 | os.remove(csv_file) 613 | 614 | i = 0 615 | with open('support_armed_times.csv', 'r') as fin: 616 | with open(csv_file, 'a') as file_out: 617 | writer = csv.writer(file_out) 618 | for row in csv.reader(fin): 619 | writer.writerow(row + [list_avg_times_AIMED[i][1]]) 620 | i += 1 621 | fin.close() 622 | file_out.close() 623 | 624 | # armed_times.csv is used as support to create csv_file with ARMED & AIMED times 625 | os.remove('support_armed_times.csv') 626 | 627 | return average_times_ARMED, average_times_AIMED 628 | 629 | 630 | def comparing_AXMED(): 631 | """ 632 | Create a CSV to be used directly in LaTeX with comparison between 633 | processing times of ARMED & AIMED 634 | """ 635 | 636 | # Prepare data to compare processing times between ARMED & AIMED 637 | AXMED_seconds = time_to_seconds('db/compare.csv') 638 | 639 | # Sum all instances of times with same number of files created 640 | sum_times_files_ARMED = sum_times(AXMED_seconds, 'Time M1') 641 | sum_times_files_AIMED = sum_times(AXMED_seconds, 'Time M2') 642 | 643 | # Group all lines with the same value of files / mutations generated 644 | number_files_grouped_AXMED = AXMED_seconds.groupby('Files M1').size() 645 | 646 | # Retrieve a csv file with 3 columns: 1) files generated 2) times ARMED and 3) times AIMED 647 | average_times(number_files_grouped_AXMED, sum_times_files_ARMED, sum_times_files_AIMED, 648 | csv_file='db/compare_armed_aimed.csv', save=True) 649 | 650 | 651 | # GET MALWARE SCORE USING PRE-TRAINED MODELS 652 | 653 | def load_av(filename): 654 | """ 655 | Load pre-saved model (lgb or pickle). 656 | 657 | Input: 658 | filename: model with .pkl extension 659 | """ 660 | # Convert to joblib (.pkl) if lgb model (.txt) in the input 661 | if filename.endswith(".txt"): 662 | bst = lightgbm.Booster(model_file=filename) 663 | new_filename = filename[:-4] + ".pkl" 664 | joblib.dump(bst, new_filename) 665 | loaded_model = joblib.load(new_filename) 666 | else: 667 | loaded_model = joblib.load(filename) 668 | return loaded_model 669 | 670 | 671 | def get_score_local(sample_bytes, model, top_features_path=''): 672 | """ 673 | Extract features from malware and get score using pre-saved model 674 | Ver.2: PEFeatureExtractor from EMBER dataset with 2381 features 675 | 676 | Input: 677 | sample_bytes: malware example 678 | model: ML-based model (i.e., LightGBM) 679 | top_features_path: path to NPZ with index of top features (Optional) 680 | """ 681 | 682 | # Handle LightGBM exception if different version of features used during training & testing (v1=2351 & v2=2381) 683 | if model.num_feature() == 2351: 684 | feature_extractor = PEFeatureExtractor(feature_version=1) 685 | elif model.num_feature() == 2381: 686 | feature_extractor = PEFeatureExtractor(feature_version=2) 687 | else: 688 | sys.exit('Number of features known are v1:2351 and v2:2381. 
Features detected: {}'.format(model.num_feature())) 689 | 690 | # Extract features of adversarial example 691 | features = feature_extractor.feature_vector(sample_bytes) 692 | 693 | # Optionally: Get score using reduced number of features (based on top20% of highest modified features of Logit model) 694 | if len(top_features_path) > 0: 695 | top_features = np.load(top_features_path) 696 | features = features[top_features['arr_0']] 697 | 698 | # Get malicious score from a single malware example 699 | score = model.predict([features])[0] 700 | 701 | return score 702 | 703 | 704 | # UAP related modules: Convert problem-space to feature-space dataset, exploration set, etc. 705 | 706 | 707 | def save_features_malware(csv_path, features_path, pert_vector): 708 | """ 709 | Saving features from adversarial examples (=evasion) of problem-space malware 710 | 711 | Input: 712 | csv_path: path to the CSV file 713 | features_path: path to the extracted features from files 714 | pert_vector: perturbation vector injected 715 | """ 716 | 717 | feature_extractor = PEFeatureExtractor() 718 | orig_features = [] 719 | mod_features = [] 720 | 721 | with open(csv_path + 'evasion.csv') as csv_file: 722 | 723 | dict_read = csv.DictReader(csv_file) 724 | for row in dict_read: 725 | 726 | # Ignoring malware with LIEF errors 727 | if row['Original_File'][21:25] == 'LIEF': 728 | continue 729 | 730 | # print(row['Original_File'], row['Manipulated_File']) 731 | 732 | try: 733 | orig_bin_bytes = readfile(row['Original_File']) 734 | mod_bin_bytes = readfile(row['Manipulated_File']) 735 | except OSError as e: 736 | print(e) 737 | 738 | orig_current_features = np.array(feature_extractor.feature_vector(orig_bin_bytes), dtype=np.float32) 739 | mod_current_features = np.array(feature_extractor.feature_vector(mod_bin_bytes), dtype=np.float32) 740 | 741 | orig_features.append(orig_current_features) 742 | mod_features.append(mod_current_features) 743 | 744 | orig_features = np.array(orig_features) 745 | mod_features = np.array(mod_features) 746 | 747 | np.savez_compressed(features_path + 'orig_files_uap_compress'.format(pert_vector), features=orig_features) 748 | np.savez_compressed(features_path + 'adv_examples_uap_compress'.format(pert_vector), features=mod_features) 749 | 750 | orig_loaded = np.load(features_path + 'orig_files_uap_compress.npz'.format(pert_vector)) 751 | mod_loaded = np.load(features_path + 'adv_examples_uap_compress.npz'.format(pert_vector)) 752 | 753 | # print('\nFeatures from original & modified problem-space malware saved.\n') 754 | # print('Orig:', orig_loaded['features'], len(orig_loaded['features'])) 755 | # print('Mod:', mod_loaded['features'], len(mod_loaded['features'])) 756 | 757 | assert np.array_equal(orig_features, orig_loaded['features']), 'Different sizes!' 758 | assert np.array_equal(mod_features, mod_loaded['features']), 'Different sizes!' 759 | 760 | 761 | # print('\nCompressed and original versions are equal in size: Checked') 762 | 763 | 764 | def copy_files_csv(csv_path='', dest_path=''): 765 | """ 766 | Copying specific examples by parsing (adversarial | detected | corrupt) 767 | CSV files. 
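For example (hypothetical paths), copy_files_csv(csv_path='db/evasion.csv', dest_path='samples/successful/') copies every Original_File listed in the CSV into dest_path, naming the copies 0, 1, 2, ... in row order.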
768 | 769 | Input: 770 | csv_path: path to the CSV file 771 | dest_path: destination path 772 | """ 773 | # Ensure directory exist 774 | os.makedirs(os.path.dirname(dest_path), exist_ok=True) 775 | 776 | file_counter = 0 777 | with open(csv_path) as csv_file: 778 | dict_read = csv.DictReader(csv_file) 779 | for row in dict_read: 780 | print(row['Original_File'], row['Manipulated_File']) 781 | shutil.copyfile(row['Original_File'], dest_path + str(file_counter)) 782 | file_counter += 1 783 | 784 | 785 | def create_exploration_validation_set(o_path='', e_path='', v_path='', threshold=0.9, model='data/lgbm_ember.pkl'): 786 | """ 787 | Create exploration & validation sets to use during greedy-process of UAP search. 788 | 789 | Input: 790 | o_path: origin path, pool of malware to sample from 791 | e_path: exploration path 792 | v_path: validation path 793 | """ 794 | exploration_files = 100 795 | validation_files = 1000 796 | 797 | # Ensure directories exist 798 | os.makedirs(os.path.dirname(e_path), exist_ok=True) 799 | os.makedirs(os.path.dirname(v_path), exist_ok=True) 800 | 801 | # Load LightGBM model 802 | av_model = load_av(model) 803 | 804 | number_samples = 0 805 | for sample in os.listdir(o_path): 806 | bin_bytes = readfile(o_path + sample) 807 | score = get_score_local(bin_bytes, av_model) 808 | 809 | # Collect {validation_files} *different* detected samples for UAP process 810 | if number_samples < validation_files: 811 | if score > threshold: 812 | number_samples += 1 813 | shutil.copyfile(o_path + sample, v_path + sample) 814 | print('Validation set: Malware {} detected & copied ({})'.format(number_samples, round(score, 2))) 815 | 816 | # Collect {exploration_files} *different* detected samples for UAP process 817 | elif validation_files <= number_samples < exploration_files + validation_files: 818 | if score > threshold and sample not in os.listdir(v_path): 819 | number_samples += 1 820 | shutil.copyfile(o_path + sample, e_path + sample) 821 | print('Exploration set: Malware {} detected & copied ({})'.format(number_samples-validation_files, round(score, 2))) 822 | else: 823 | sys.exit('\nExploration & Validation sets correctly created.') 824 | 825 | 826 | def clean_cuckoo_analyses_folder(path='.cuckoo/storage/analyses'): 827 | """ 828 | Delete analysis folder to spare storage. 
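Keeps the five most recent analysis folders and deletes everything older, via the sorted(...)[:-5] slice below.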
829 | 830 | Input: 831 | path: [default] path to Cuckoo 832 | """ 833 | 834 | path_analyses = os.path.join(os.path.expanduser('~'), path) 835 | for file in sorted(os.listdir(path_analyses))[:-5]: 836 | filename = os.path.join(path_analyses, file) 837 | shutil.rmtree(filename) 838 | return 839 | -------------------------------------------------------------------------------- /src/rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # AIMED-RL: Automatic Intelligent Malware modifications to Evade Detection - with Reinforcement Learning 4 | import numpy as np 5 | import random 6 | import distutils 7 | from enum import Enum 8 | import csv 9 | 10 | import chainer 11 | import chainerrl 12 | import chainer.links as L 13 | import chainer.functions as F 14 | from chainerrl.initializers import LeCunNormal 15 | from chainerrl.distribution import SoftmaxDistribution 16 | from chainerrl.action_value import DiscreteActionValue 17 | from chainerrl.optimizers import rmsprop_async 18 | from chainerrl import links 19 | from chainerrl.replay_buffers import EpisodicReplayBuffer 20 | 21 | import gym 22 | from gym import spaces 23 | 24 | import src.functions as f 25 | from collections import OrderedDict 26 | import data.pefeatures as pefeatures 27 | import os 28 | from datetime import datetime 29 | import data.manipulate as m 30 | from time import time 31 | 32 | ACTIONS = f.actions_vector(m.ACTION_TABLE.keys()) 33 | 34 | # Reward Weight Distributions: 35 | STANDARD_WEIGHTS = [0.33, 0.33, 0.33] 36 | INCREMENT_WEIGHTS = [0.5, 0.2, 0.3] # decrease importance of similarity 37 | 38 | 39 | # Class based on OpenAI Gym Environment and Gym Malware (https://github.com/endgameinc/gym-malware/) 40 | class MalwareEnv(gym.Env): 41 | metadata = {'render.modes': ['human']} 42 | 43 | def __init__(self, malware_list, detection_function, analysis_function): 44 | random.seed(PARAM_DICT["seed"]) 45 | self.malware_list = malware_list 46 | self.used_malware = [] 47 | self.actions = ACTIONS 48 | self.action_space = spaces.Discrete(len(ACTIONS)) 49 | self.actions_taken = [] 50 | 51 | self.max_turns = PARAM_DICT["max_turns"] 52 | 53 | self.strategy_reset = PARAM_DICT["strategy_reset"] # Reset actions after half 54 | self.strategy_inject = PARAM_DICT["strategy_inject"] # Inject actions randomly after half 55 | assert not (self.strategy_reset and self.strategy_inject) # Only one strategy possible at a time 56 | 57 | self.turns = 0 58 | 59 | # Reward weights: 60 | self.reward_weights = PARAM_DICT["weights"] 61 | assert np.sum(self.reward_weights) <= 1.0 # Sum of weights must not be bigger than 1 62 | self.detected_weight = self.reward_weights[0] 63 | self.similarity_weight = self.reward_weights[1] 64 | self.distance_weight = self.reward_weights[2] 65 | 66 | self.reward_punishment = PARAM_DICT["reward_punishment"] 67 | 68 | self.history = OrderedDict() 69 | 70 | # Functions: 71 | self.detector_function = detection_function 72 | self.functionality_function = lambda: (random.randint(0, 10), 0) # analysis_function 73 | self.similarity_function = f.get_difference 74 | 75 | # Malware Features: 76 | self.feature_extractor = pefeatures.PEFeatureExtractor() 77 | self.current_malware = None 78 | self.current_manipulation = None 79 | self.original_bytez = None 80 | self.feature_space = None 81 | 82 | def step(self, action_index): 83 | # Apply strategy - Reset actions after half: 84 | if self.strategy_reset and self.turns == self.max_turns / 2: 85 | self.actions_taken = [] 86 | 
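# Note (annotation): the reset strategy rolls the episode back to the pristine
# sample at half time; pointing current_manipulation at the original file makes
# the next similarity computation run against an unmodified baseline, and the
# "RESET" marker appended below keeps the action history readable in render().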
self.current_manipulation = self.current_malware 87 | self.history[self.current_malware].append("RESET") 88 | 89 | self.turns += 1 90 | 91 | # Manipulate Malware: 92 | try: 93 | bytez = self._take_action(action_index) 94 | # Update State 95 | self.feature_space = self.feature_extractor.feature_vector(bytez) 96 | except Exception as e: # PE Manipulation Errors 97 | print('Exception raised:', e) 98 | reward = 0 99 | episode_over = True 100 | return np.asarray(self.feature_space), reward, episode_over, \ 101 | {"detected": False, "detected_confidence": 0, "errored": True} 102 | 103 | reward, detected, detected_confidence = self._calculate_reward() 104 | 105 | max_turns_reached = False 106 | if self.turns >= self.max_turns: 107 | # reward = 0.0 Temporarily removed for last training 108 | max_turns_reached = True 109 | 110 | # print("Detected: ", detected) 111 | # print("Max_turns_reached:", max_turns_reached) 112 | 113 | episode_over = max_turns_reached or not detected 114 | # print("Episode over: ", episode_over) 115 | 116 | # Info may not be used by RL_Agent 117 | info = {"detected": detected, "detected_confidence": detected_confidence, "errored": False} 118 | return np.asarray(self.feature_space), reward, episode_over, info 119 | 120 | def get_random_action(self): 121 | action = random.randrange(0, len(self.actions)) 122 | print("Random action: " + self.actions[action]) 123 | return action 124 | 125 | def _take_action(self, action_index): 126 | action = self.actions[action_index] 127 | if self.strategy_inject and self.turns > self.max_turns / 2: 128 | random_index = random.randrange(start=0, stop=len(self.actions_taken) - 1, step=1) 129 | self.actions_taken[random_index] = action_index 130 | self.history[self.current_malware][random_index] = action 131 | else: 132 | self.actions_taken.append(action_index) 133 | self.history[self.current_malware].append(action) 134 | 135 | # Check to avoid 'list out of index' exceptions 136 | if self.turns <= self.max_turns / 2: 137 | self.current_manipulation = f.rec_mod_files(input_bytes=self.original_bytez, 138 | actions=self.actions, 139 | chosen_actions=self.actions_taken, 140 | inject_perturbation=self.turns-1) 141 | return f.readfile(self.current_manipulation) 142 | 143 | def reset(self): 144 | self.turns = 0 145 | self.actions_taken = [] 146 | self.current_malware = self._choose_next_malware() 147 | self.used_malware.append(self.current_malware) 148 | self.current_manipulation = self.current_malware # For similarity 149 | self.original_bytez = f.readfile(self.current_malware) 150 | self.feature_space = self.feature_extractor.feature_vector(self.original_bytez) # Observation space 151 | self.history[self.current_malware] = [] 152 | 153 | return np.asarray(self.feature_space) 154 | 155 | def reset_completely(self): 156 | # Also reset used malware 157 | self.history = OrderedDict() 158 | self.reset() 159 | self.used_malware = [] 160 | 161 | # Randomly choose next malware, which has not been used 162 | # If all available malware samples have already been used, list is resetting 163 | def _choose_next_malware(self): 164 | temp_list = [malware for malware in self.malware_list if self.used_malware.count(malware) == 0] 165 | if len(temp_list) == 0: 166 | temp_list = self.malware_list 167 | self.used_malware = [] 168 | return random.choice(temp_list) 169 | 170 | # Reward = Detection (0/1) + Similarity (functions.get_difference) + Distance (self.turns) 171 | def _calculate_reward(self): 172 | max_reward = PARAM_DICT["maximum_reward"] # AIMED-RL Paper: R_max = 
10 173 | 174 | # ** Detected: Is the malware detected by the model? ** 175 | # Not detected: good, detected: bad 176 | # Value: 0/10 177 | detected, confidence = self.detector_function(self.current_manipulation) 178 | # print("DETECTED FROM MODEL: ", detected, confidence) 179 | detected_reward = 0 180 | if not detected: 181 | detected_reward = max_reward 182 | 183 | # ** Similarity: How much does the manipulation differ from the original file? ** 184 | # Difference ~40%: Best (empirical estimate) 185 | # Value: 0-10 186 | 187 | # Gets back the difference between original and manipulated bytez, smaller value: more similar 188 | difference = self.similarity_function(self.current_manipulation, self.current_malware) 189 | original_length = len(self.original_bytez) 190 | similarity_reward = self._calculate_similarity_reward(difference, original_length) 191 | 192 | # ** Distance: How many actions have been taken? ** 193 | # More turns mean more actions and thus a more diverse action vector 194 | # Limit of 5 perturbations => Should be used completely 195 | # Value: #actions * (max_reward/max_perturbations) 196 | max_perturbations = PARAM_DICT["max_turns"] / 2 if PARAM_DICT["strategy_reset"] or PARAM_DICT["strategy_inject"] \ 197 | else PARAM_DICT["max_turns"] # Applying a strategy halves the amount of possible perturbations 198 | factor = max_reward / max_perturbations # 2 in AIMED-RL Paper 199 | distance_reward = len(self.actions_taken) * factor 200 | 201 | # Max value: max_reward 202 | reward = detected_reward * self.detected_weight + similarity_reward * self.similarity_weight \ 203 | + distance_reward * self.distance_weight 204 | 205 | if self.reward_punishment: 206 | punishment = self._calculate_doubled_perturbation_punishment() 207 | 208 | if detected: # Do not apply punishment if the last perturbation led to an adversarial sample 209 | reward *= punishment 210 | 211 | return reward, detected, confidence 212 | 213 | # Compare the actual similarity against the best value (40%) and compute the reward accordingly 214 | def _calculate_similarity_reward(self, difference, original): 215 | percent_sim = difference / original 216 | percent_best = 0.4 # Empirical estimate: a ~40% difference from the original file could be best for the adversarial file 217 | reward_sim = (1 - abs(percent_sim - percent_best)) * PARAM_DICT["maximum_reward"] # AIMED_RL Paper: R_max = 10 218 | return max(0, reward_sim) # No negative reward 219 | 220 | # Give punishment to doubled perturbations in actions_taken 221 | def _calculate_doubled_perturbation_punishment(self): 222 | no_punishment = 1 # no reduction 223 | punishment_doubled_once = 0.8 # 20% less reward 224 | punishment_doubled_twice = 0.6 # 40% less reward 225 | for action in self.actions_taken: 226 | if self.actions_taken.count(action) > 2: 227 | return punishment_doubled_twice 228 | if self.actions_taken.count(action) > 1: 229 | return punishment_doubled_once 230 | 231 | return no_punishment 232 | 233 | def render(self, mode='human', close=False): 234 | if self.current_malware is not None and self.history[self.current_malware] is not None: 235 | # print("Input object: " + str(self.current_malware)) 236 | if "RESET" in self.history[self.current_malware]: 237 | index_reset = self.history[self.current_malware].index("RESET") 238 | history_length = len(self.history[self.current_malware]) 239 | # print("Actions (before reset): " + str(self.history[self.current_malware][0:index_reset])) 240 | print("Actions (after reset): " + str(
self.history[self.current_malware][index_reset + 1:history_length])) 242 | else: 243 | print("Actions: " + str(self.history[self.current_malware])) 244 | else: 245 | print("Environment has not been reset.") 246 | 247 | 248 | # The DQNSettings Enum summarizes the possible enhancements to the DQN Algorithm 249 | class DQNSettings(Enum): 250 | REPLAY_BUFFER = 1 251 | PRIORITIZED_REPLAY_BUFFER = 2 252 | ADAM_OPTIMIZER = 3 253 | LINEAR_DECAY_EPSILON_GREEDY = 4 254 | BOLTZMANN_EXPLORATION = 5 255 | NOISY_NETS = 6 256 | ALGO_DQN = 7 257 | ALGO_ACER = 8 258 | ALGO_DISTDQN = 9 259 | 260 | 261 | # Reinforcement learning agent using chainer-rl library 262 | class RlAgent: 263 | def __init__(self, environment: MalwareEnv): 264 | self.env = environment 265 | self.obs_size = len(environment.feature_space) 266 | self.n_actions = environment.action_space.n 267 | 268 | if DQNSettings.ALGO_ACER.name in PARAM_DICT["agent"]: 269 | self.agent = self.create_acer_agent() 270 | else: 271 | self.agent = self.create_dqn_agent() 272 | 273 | # For Algorithm Implementation see: https://github.com/endgameinc/gym-malware/blob/master/train_agent_chainer.py 274 | # Rainbow: https://github.com/chainer/chainerrl/tree/master/examples/atari/reproduction/rainbow 275 | def create_dqn_agent(self): 276 | q_func = None 277 | if DQNSettings.ALGO_DQN.name in PARAM_DICT["agent"]: 278 | q_func = QFunction(self.obs_size, self.n_actions) 279 | elif DQNSettings.ALGO_DISTDQN.name in PARAM_DICT["agent"]: 280 | q_func = chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction( 281 | ndim_obs=self.obs_size, 282 | n_actions=self.n_actions, 283 | n_atoms=51, # See paper 284 | v_min=-10, # See paper 285 | v_max=10, # max reward 286 | n_hidden_layers=2, 287 | n_hidden_channels=64 288 | ) 289 | assert q_func is not None 290 | 291 | optimizer = None 292 | if DQNSettings.ADAM_OPTIMIZER.name in PARAM_DICT["optimizer"]: 293 | optimizer = chainer.optimizers.Adam(eps=PARAM_DICT["adam_epsilon"]) 294 | optimizer.setup(q_func) 295 | assert optimizer is not None 296 | 297 | explorer = None 298 | if DQNSettings.LINEAR_DECAY_EPSILON_GREEDY.name in PARAM_DICT["explorer"]: 299 | explorer = chainerrl.explorers. 
\ 300 | LinearDecayEpsilonGreedy(start_epsilon=1.0, 301 | end_epsilon=0.05, 302 | decay_steps=100, 303 | random_action_func=self.env.get_random_action) 304 | elif DQNSettings.BOLTZMANN_EXPLORATION.name in PARAM_DICT["explorer"]: 305 | explorer = chainerrl.explorers.Boltzmann(T=PARAM_DICT["boltzmann_temperature"]) 306 | elif DQNSettings.NOISY_NETS.name in PARAM_DICT["explorer"]: 307 | links.to_factorized_noisy(q_func, sigma_scale=0.5) # Sigma from chainerrl rainbow 308 | explorer = chainerrl.explorers.Greedy() # Turn off explorer (because of Noisy Nets) 309 | assert explorer is not None 310 | 311 | replay_buffer = None 312 | if DQNSettings.REPLAY_BUFFER.name in PARAM_DICT["replay_buffer"]: 313 | replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=PARAM_DICT["replay_buffer_capacity"]) 314 | elif DQNSettings.PRIORITIZED_REPLAY_BUFFER.name in PARAM_DICT["replay_buffer"]: 315 | betasteps = PARAM_DICT["max_turns"] * PARAM_DICT["episodes"] 316 | replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer( 317 | capacity=PARAM_DICT["replay_buffer_capacity"], 318 | alpha=0.6, 319 | beta0=0.4, 320 | betasteps=betasteps, # max_turns*episodes 321 | eps=0.01, 322 | normalize_by_max=True, 323 | error_min=0, 324 | error_max=1, 325 | num_steps=1) 326 | assert replay_buffer is not None 327 | 328 | phi = lambda obs: obs.astype(np.float32, copy=False) 329 | 330 | agent = None 331 | if DQNSettings.ALGO_DQN.name in PARAM_DICT["agent"]: 332 | agent = chainerrl.agents.DoubleDQN(q_function=q_func, 333 | optimizer=optimizer, 334 | replay_buffer=replay_buffer, 335 | explorer=explorer, 336 | gamma=PARAM_DICT["dqn_gamma"], 337 | replay_start_size=PARAM_DICT["replay_start_size"], 338 | update_interval=PARAM_DICT["update_interval"], 339 | target_update_interval=PARAM_DICT["target_update_interval"], 340 | phi=phi) 341 | elif DQNSettings.ALGO_DISTDQN.name in PARAM_DICT["agent"]: 342 | agent = chainerrl.agents.CategoricalDoubleDQN(q_function=q_func, 343 | optimizer=optimizer, 344 | replay_buffer=replay_buffer, 345 | gamma=PARAM_DICT["dqn_gamma"], 346 | explorer=explorer, 347 | minibatch_size=PARAM_DICT["minibatch_size"], 348 | replay_start_size=PARAM_DICT["replay_start_size"], 349 | target_update_interval=PARAM_DICT["target_update_interval"], 350 | update_interval=PARAM_DICT["update_interval"], 351 | batch_accumulator=PARAM_DICT["batch_accumulator"], 352 | phi=phi, 353 | ) 354 | assert agent is not None 355 | return agent 356 | 357 | # ACER agent was used in Gym Malware Environment. 
In this framework, however, it has not been thoroughly tested 358 | def create_acer_agent(self): 359 | model = chainerrl.agents.acer.ACERSeparateModel( 360 | pi=links.Sequence( 361 | L.Linear(self.obs_size, 1024, initialW=LeCunNormal(1e-3)), 362 | F.relu, 363 | L.Linear(1024, 512, initialW=LeCunNormal(1e-3)), 364 | F.relu, 365 | L.Linear(512, self.n_actions, initialW=LeCunNormal(1e-3)), 366 | SoftmaxDistribution), 367 | q=links.Sequence( 368 | L.Linear(self.obs_size, 1024, initialW=LeCunNormal(1e-3)), 369 | F.relu, 370 | L.Linear(1024, 512, initialW=LeCunNormal(1e-3)), 371 | F.relu, 372 | L.Linear(512, self.n_actions, initialW=LeCunNormal(1e-3)), 373 | DiscreteActionValue), 374 | ) 375 | 376 | opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99) 377 | opt.setup(model) 378 | opt.add_hook(chainer.optimizer.GradientClipping(40)) 379 | 380 | replay_buffer = EpisodicReplayBuffer(128) 381 | 382 | phi = lambda obs: obs.astype(np.float32, copy=False) 383 | 384 | agent = chainerrl.agents.ACER(model, opt, 385 | gamma=PARAM_DICT["dqn_gamma"], # reward discount factor 386 | t_max=32, # update the model after this many local steps 387 | replay_buffer=replay_buffer, 388 | n_times_replay=4, # number of times experience replay is repeated for each update 389 | replay_start_size=64, 390 | # don't start replay unless we have this many experiences in the buffer 391 | disable_online_update=True, # rely only on experience buffer 392 | use_trust_region=True, # enable trust region policy optimization 393 | trust_region_delta=0.1, # a parameter for TRPO 394 | truncation_threshold=5.0, # truncate large importance weights 395 | beta=1e-2, # entropy regularization parameter 396 | phi=phi) 397 | 398 | return agent 399 | 400 | def make_action(self, state, reward, train=True): 401 | if train: 402 | return self.agent.act_and_train(state, reward) 403 | 404 | return self.agent.act(state) 405 | 406 | def stop_episode_and_train(self, state, reward, done): 407 | self.agent.stop_episode_and_train(state, reward, done) 408 | 409 | def stop_episode(self): 410 | self.agent.stop_episode() 411 | 412 | def save_existing_agent(self, directory_agent): 413 | self.agent.save(directory_agent) 414 | distutils.dir_util.copy_tree(directory_agent, PARAM_DICT['save_agent']+'last/') 415 | 416 | def print_debug(self): 417 | print("RL AGENT: " + str(PARAM_DICT["name"])) 418 | print("Statistics: ", self.agent.get_statistics()) 419 | 420 | 421 | # See https://github.com/endgameinc/gym-malware/blob/master/train_agent_chainer.py 422 | class QFunction(chainer.Chain): 423 | def __init__(self, obs_size, n_actions): 424 | super(QFunction, self).__init__() 425 | n_hidden_channels = PARAM_DICT["dqn_hidden_size"] 426 | net = [] 427 | inp_dim = obs_size 428 | for i, n_hid in enumerate(n_hidden_channels): 429 | net += [('l{}'.format(i), L.Linear(inp_dim, n_hid))] 430 | net += [('norm{}'.format(i), L.BatchNormalization(n_hid))] 431 | net += [('_act{}'.format(i), F.relu)] 432 | inp_dim = n_hid 433 | 434 | net += [('output', L.Linear(inp_dim, n_actions))] 435 | 436 | with self.init_scope(): 437 | for n in net: 438 | if not n[0].startswith('_'): 439 | setattr(self, n[0], n[1]) 440 | 441 | self.forward = net 442 | 443 | def __call__(self, x, test=False): 444 | """ 445 | Args: 446 | x (ndarray or chainer.Variable): An observation 447 | test (bool): a flag indicating whether it is in test mode 448 | """ 449 | for n, f in self.forward: 450 | if not n.startswith('_'): 451 | x = getattr(self, n)(x) 452 | else: 453 | x = f(x) 454 | 455 | return
chainerrl.action_value.DiscreteActionValue(x) 456 | 457 | 458 | class Logger: 459 | """ 460 | Logger class to write data during training/evaluation to a csv file 461 | It also creates a training or evaluation report at the end that summarizes the results. 462 | The report also contains the current version of the PARAM_DICT to make the experiments reproducible 463 | """ 464 | def __init__(self, directory_to_save, evaluate): 465 | self.directory = directory_to_save 466 | self.adversarial_samples = [] 467 | self.values_of_one_file = [] 468 | if evaluate: 469 | self.data_file_name = PARAM_DICT["name"] + "_" + str(PARAM_DICT["threshold"]) + "_eval_data.csv" 470 | else: 471 | self.data_file_name = PARAM_DICT["name"] + "_train_data.csv" 472 | 473 | def reset_after_error(self): 474 | self.values_of_one_file = [] 475 | 476 | def log_turn_values(self, detection_value, reward, turn, episode, adversarial, actions_taken, malware): 477 | self.values_of_one_file.append((detection_value, reward, turn, episode, adversarial, actions_taken, malware)) 478 | if adversarial: 479 | self.adversarial_samples.append( 480 | (detection_value, reward, turn, episode, adversarial, actions_taken, malware)) 481 | 482 | def write_sample_values_to_file(self): 483 | # Initial create 484 | if not os.path.isfile(self.directory + self.data_file_name): 485 | data_report = open(self.directory + self.data_file_name, 'w') 486 | data_report.write("detection_value,reward,turn,episode,adversarial,actions_taken,malware") 487 | data_report.close() 488 | 489 | data_report = open(self.directory + self.data_file_name, 'a') 490 | for detection_value, reward, turn, episode, adversarial, actions_taken, malware in self.values_of_one_file: 491 | data_report.write("\n") 492 | adver_value = "1" if adversarial else "0" 493 | actions_string = str(actions_taken).replace("'", "").replace(",", ";") 494 | report_string = str(detection_value) + "," + str(reward) + "," + str(turn) + "," + str(episode) + "," + \ 495 | str(adver_value) + "," + actions_string + "," + str(malware).split("/")[-1] 496 | data_report.write(report_string) 497 | data_report.close() 498 | self.values_of_one_file = [] 499 | 500 | def save_agent_training_test_report(self, total_time, average_q, average_loss, agent_number_updates): 501 | type_dict = PARAM_DICT.copy() 502 | for key in type_dict: 503 | type_dict[key] = type(PARAM_DICT[key]) 504 | with open(self.directory + str(PARAM_DICT["name"]) + "_training_report.csv", 'w') as agent_report: 505 | w = csv.DictWriter(agent_report, PARAM_DICT.keys()) 506 | w.writeheader() 507 | w.writerow(PARAM_DICT) 508 | w.writerow(type_dict) 509 | 510 | pref_act_vector = self._calculate_most_often_used_action_vector() 511 | 512 | agent_report.write("\nAverage Q: " + str(average_q)) 513 | agent_report.write("\nAverage Loss: " + str(average_loss)) 514 | agent_report.write("\nNumber Updates Agent: " + str(agent_number_updates)) 515 | agent_report.write("\nPreferred Action Vector: " + str(pref_act_vector)) 516 | agent_report.write("\nTotal Time: " + str(total_time)) 517 | agent_report.write("\nNumber adversarial samples: " + str(len(self.adversarial_samples))) 518 | agent_report.close() 519 | 520 | # Add a copy of the reports to the last/ dir 521 | distutils.dir_util.copy_tree(self.directory, PARAM_DICT['save_report']+'training_reports/last/') 522 | 523 | def save_agent_evaluation_report(self, total_time, number_errored, average_q, average_loss, agent_number_updates): 524 | if not os.path.isdir(PARAM_DICT["save_report"] + "evaluating_reports/"): 525 | 
os.mkdir(PARAM_DICT["save_report"] + "evaluating_reports/") 526 | with open(str(PARAM_DICT["save_report"] + "evaluating_reports/" + str(PARAM_DICT["name"]) + "_" + str(PARAM_DICT["threshold"]) + "_evaluation_report.csv"), 'w') as agent_report: 527 | w = csv.DictWriter(agent_report, PARAM_DICT.keys()) 528 | w.writeheader() 529 | w.writerow(PARAM_DICT) 530 | 531 | pref_act_vector = self._calculate_most_often_used_action_vector() 532 | 533 | agent_report.write("\nAverage Q: " + str(average_q)) 534 | agent_report.write("\nAverage Loss: " + str(average_loss)) 535 | agent_report.write("\nNumber Updates Agent: " + str(agent_number_updates)) 536 | agent_report.write("\nPreferred Action Vector: " + str(pref_act_vector)) 537 | agent_report.write("\nTotal Time: " + str(total_time)) 538 | agent_report.write("\nNumber adversarial samples: " + str(len(self.adversarial_samples))) 539 | agent_report.write("\nNumber errored: " + str(number_errored)) 540 | agent_report.close() 541 | 542 | def _calculate_most_often_used_action_vector(self): 543 | actions = [act for (v, re, t, r, adv, act, ma) in self.adversarial_samples] 544 | if not actions: 545 | return [] 546 | return max(actions, key=actions.count) 547 | 548 | 549 | def _create_env(malware_path, malware_detection_function, malware_analysis_function): 550 | try: 551 | samples = os.listdir(malware_path) 552 | for i in range(len(samples)): 553 | samples[i] = malware_path + samples[i] 554 | except NotADirectoryError: 555 | samples = [malware_path] # Only test one sample 556 | 557 | env = MalwareEnv(malware_list=samples, 558 | detection_function=malware_detection_function, 559 | analysis_function=malware_analysis_function) 560 | return env 561 | 562 | 563 | # Creates the directories where the training and evaluation data, as well as the agent is stored 564 | def _make_saving_directories(): 565 | if not os.path.isdir(PARAM_DICT["save_report"] + "training_reports/"): 566 | os.mkdir(PARAM_DICT["save_report"] + "training_reports/") 567 | date_and_time_now = str(datetime.now()).split(".")[0].replace(" ", "-").replace(":", "-")[0:-3] # no seconds 568 | 569 | directory_logging = PARAM_DICT["save_report"] + "training_reports/" + date_and_time_now + "/" 570 | os.makedirs(directory_logging) 571 | directory_agent = PARAM_DICT["save_agent"] + date_and_time_now + "/" 572 | os.makedirs(directory_agent) 573 | return directory_logging, directory_agent 574 | 575 | 576 | # Creates a new agent and trains it with the current parameters from the PARAM_DICT 577 | def train_and_save_agent(malware_detection, malware_analysis): 578 | directory_logging, directory_agent = _make_saving_directories() 579 | malware_detection_function = lambda sample: malware_detection(sample=sample, 580 | model=PARAM_DICT["detection_model"], 581 | threshold=PARAM_DICT["threshold"]) 582 | # Environment: 583 | env = _create_env(malware_path=PARAM_DICT["malware_path"], 584 | malware_detection_function=malware_detection_function, 585 | malware_analysis_function=malware_analysis) 586 | state = env.reset() 587 | env.render() 588 | # Agent: 589 | agent = RlAgent(environment=env) 590 | # agent.print_debug() 591 | 592 | # Logger: 593 | logger = Logger(directory_to_save=directory_logging, 594 | evaluate=False) 595 | start_time = time() 596 | 597 | # TRAIN: 598 | episodes = PARAM_DICT["episodes"] 599 | episode = 1 600 | while episode <= episodes: 601 | print("\n### Training # Episode: {} of {} ###".format(episode, episodes)) 602 | current_turn = 0 603 | reward, episode_over, info, errored = 0, False, {}, False 604 | 
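# Note (annotation): one turn = one perturbation. make_action(train=True)
# calls the agent's act_and_train(), so the agent both picks the next action
# and learns from the previous transition; the episode ends once the mutation
# evades the detector or max_turns is reached, and episodes that hit LIEF
# manipulation errors are discarded and repeated (episode -= 1 below).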
while not episode_over: 605 | current_turn += 1 606 | action = agent.make_action(state, reward, train=True) 607 | print('\n## Turn: {} # Next action: {} ##'.format(current_turn, ACTIONS[action])) 608 | state, reward, episode_over, info = env.step(action) 609 | print("Reward in turn " + str(current_turn) + " : " + str(reward)) 610 | env.render() 611 | 612 | detected = info["detected"] 613 | detection_value = info["detected_confidence"] 614 | errored = info["errored"] # LIEF ERRORS DURING MANIPULATION (True/False) 615 | if not errored: 616 | logger.log_turn_values(detection_value=detection_value, 617 | reward=reward, 618 | turn=current_turn, 619 | episode=episode, 620 | adversarial=not detected, 621 | actions_taken=_map_action_indices_to_actions(env.actions_taken), 622 | malware=env.current_malware) 623 | elif errored: 624 | episode -= 1 # Ignore this episode 625 | print('Episode ignored due to manipulation errors. Restarting..') 626 | 627 | if not errored: 628 | agent.stop_episode_and_train(state, reward, episode_over) 629 | logger.write_sample_values_to_file() 630 | else: 631 | agent.stop_episode() # Do not train on errored malware 632 | logger.reset_after_error() 633 | 634 | state = env.reset() 635 | # print("Episode ended after " + str(current_turn) + " turns") 636 | # print("Reward after episode: " + str(reward) + "\n") 637 | 638 | episode += 1 639 | 640 | print("Training finished!") 641 | agent.save_existing_agent(directory_agent) 642 | avg_q = agent.agent.get_statistics()[0][1] 643 | avg_loss = agent.agent.get_statistics()[1][1] 644 | number_updates = agent.agent.get_statistics()[2][1] 645 | logger.save_agent_training_test_report(total_time=f.time_me(start_time), 646 | average_q=avg_q, 647 | average_loss=avg_loss, 648 | agent_number_updates=number_updates) 649 | return directory_logging 650 | 651 | 652 | # Loads the PARAM_DICT associated with an agent (in training report) 653 | def _load_agent_information(agent_information): 654 | global PARAM_DICT # Check 655 | # Load Dictionary: 656 | with open(agent_information, 'r') as file: 657 | r = csv.DictReader(file) 658 | loaded_dicts = [dict(d) for d in r] 659 | PARAM_DICT = loaded_dicts[0] 660 | type_dict = loaded_dicts[1] 661 | 662 | for key in PARAM_DICT: 663 | type_of_key_str = type_dict[key] 664 | if "int" in type_of_key_str: 665 | type_of_key = int 666 | elif "bool" in type_of_key_str: 667 | type_of_key = bool 668 | elif "float" in type_of_key_str: 669 | type_of_key = float 670 | elif "list" in type_of_key_str: 671 | type_of_key = list 672 | else: 673 | type_of_key = None 674 | if type_of_key is not None: 675 | if type_of_key == list: 676 | list_from_dict = str(PARAM_DICT[key]).replace("[", "").replace("]", "").split(",") 677 | map_to = int 678 | if "." 
652 | # Loads the PARAM_DICT associated with an agent (from its training report)
653 | def _load_agent_information(agent_information):
654 |     global PARAM_DICT  # Overwrite the module-level defaults with the loaded configuration
655 |     # Load Dictionary:
656 |     with open(agent_information, 'r') as file:
657 |         r = csv.DictReader(file)
658 |         loaded_dicts = [dict(d) for d in r]
659 |     PARAM_DICT = loaded_dicts[0]
660 |     type_dict = loaded_dicts[1]
661 | 
662 |     for key in PARAM_DICT:
663 |         type_of_key_str = type_dict[key]
664 |         if "int" in type_of_key_str:
665 |             type_of_key = int
666 |         elif "bool" in type_of_key_str:
667 |             type_of_key = bool
668 |         elif "float" in type_of_key_str:
669 |             type_of_key = float
670 |         elif "list" in type_of_key_str:
671 |             type_of_key = list
672 |         else:
673 |             type_of_key = None
674 |         if type_of_key is not None:
675 |             if type_of_key == list:
676 |                 list_from_dict = str(PARAM_DICT[key]).replace("[", "").replace("]", "").split(",")
677 |                 map_to = int
678 |                 if "." in list_from_dict[0]:
679 |                     map_to = float
680 |                 PARAM_DICT[key] = list(map(map_to, list_from_dict))
681 |             elif type_of_key == bool:
682 |                 PARAM_DICT[key] = True if "True" in PARAM_DICT[key] else False
683 |             else:
684 |                 PARAM_DICT[key] = type_of_key(PARAM_DICT[key])
685 | 
686 | 
687 | # Evaluates a given agent against the model from the malware_detection function.
688 | # Malware analysis can optionally be applied to successful adversarial examples after the evaluation.
689 | def load_and_evaluate_agent(directory_agent, agent_information, evaluation_set_directory,
690 |                             malware_detection, malware_analysis):
691 |     _load_agent_information(agent_information=agent_information)
692 | 
693 |     malware_detection_function = lambda sample: malware_detection(sample=sample,
694 |                                                                   model=PARAM_DICT["detection_model"],
695 |                                                                   threshold=PARAM_DICT["threshold"])
696 |     # Env
697 |     env = _create_env(malware_path=evaluation_set_directory,
698 |                       malware_detection_function=malware_detection_function,
699 |                       malware_analysis_function=malware_analysis)
700 |     state = env.reset()
701 | 
702 |     # Agent
703 |     agent = RlAgent(environment=env)
704 |     agent.agent.load(directory_agent)
705 |     # agent.print_debug()
706 | 
707 |     # Logger:
708 |     logger = Logger(directory_to_save=directory_agent, evaluate=True)
709 |     start_time = time()
710 | 
711 |     # MANIPULATE:
712 |     episodes = len(env.malware_list)
713 |     episode = 1
714 |     number_errored = 0
715 |     while episode <= episodes:
716 |         print("\n### Evaluation # Episode: {} of {} ###".format(episode, episodes))
717 |         current_turn = 0
718 |         reward, episode_over, info, errored = 0, False, {}, False
719 |         while not episode_over:
720 |             current_turn += 1
721 |             action = agent.make_action(state, reward, train=False)
722 |             print('\n## Turn: {} # Next action: {} ##'.format(current_turn, ACTIONS[action]))
723 |             state, reward, episode_over, info = env.step(action)
724 |             env.render()
725 |             # print("Action from agent: " + ACTIONS[action])
726 |             print("Reward in turn " + str(current_turn) + " : " + str(reward))
727 | 
728 |             detected = info["detected"]
729 |             detection_value = info["detected_confidence"]
730 |             errored = info["errored"]  # LIEF errors during manipulation (True/False)
731 |             if not errored:
732 |                 logger.log_turn_values(detection_value=detection_value,
733 |                                        reward=reward,
734 |                                        turn=current_turn,
735 |                                        episode=episode,
736 |                                        adversarial=not detected,
737 |                                        actions_taken=_map_action_indices_to_actions(env.actions_taken),
738 |                                        malware=env.current_malware)
739 | 
740 |             # Unlike training, the episode counter is not decremented; errored episodes are counted instead
741 |             elif errored:
742 |                 number_errored += 1
743 | 
744 |         agent.stop_episode()
745 |         if not errored:
746 |             logger.write_sample_values_to_file()
747 |         else:
748 |             logger.reset_after_error()
749 | 
750 |         state = env.reset()
751 |         # print("Episode ended after " + str(current_turn) + " turns")
752 |         # print("Reward after episode: " + str(reward) + "\n")
753 | 
754 |         episode += 1
755 | 
756 |     print("\nNumber errored: ", number_errored)
757 |     print("Evaluation finished!")
758 |     avg_q = agent.agent.get_statistics()[0][1]
759 |     avg_loss = agent.agent.get_statistics()[1][1]
760 |     number_updates = agent.agent.get_statistics()[2][1]
761 |     logger.save_agent_evaluation_report(total_time=f.time_me(start_time),
762 |                                         number_errored=number_errored,
763 |                                         average_q=avg_q,
764 |                                         average_loss=avg_loss,
765 |                                         agent_number_updates=number_updates)
766 | 
767 | 
768 | def _map_action_indices_to_actions(actions_taken):
769 |     actions = []
770 |     for index in actions_taken:
771 |         actions.append(ACTIONS[index])
772 |     return actions
773 | 
774 | 
775 | PARAM_DICT = {
776 |     "name": "AIMEDRL",
777 |     "seed": 1234,
778 |     "save_report": "db/rl/",
779 |     "save_agent": "samples/rl/agent/",
780 |     "malware_path": "samples/malware_set/",
781 |     "episodes": 1000,
782 |     "detection_model": "LightGBM",
783 |     "threshold": 0.9,
784 |     "max_turns": 10,
785 |     "strategy_reset": True,
786 |     "strategy_inject": False,
787 |     "maximum_reward": 10,
788 |     "weights": STANDARD_WEIGHTS,
789 |     "reward_punishment": True,
790 |     "agent": DQNSettings.ALGO_DISTDQN.name,
791 |     "optimizer": DQNSettings.ADAM_OPTIMIZER.name,
792 |     "adam_epsilon": 1e-2,
793 |     "dqn_gamma": 0.95,
794 |     "dqn_replay_start_size": 32,
795 |     "replay_buffer": DQNSettings.PRIORITIZED_REPLAY_BUFFER.name,
796 |     "replay_buffer_capacity": 1000,
797 |     "dqn_hidden_size": [64, 16],
798 |     "explorer": DQNSettings.NOISY_NETS.name,
799 |     "epsilon_greedy_start_epsilon": 1.0,
800 |     "epsilon_greedy_end_epsilon": 0.05,
801 |     "epsilon_greedy_decay_steps": 100,
802 |     "boltzmann_temperature": 1.0,
803 |     "replay_start_size": 32,
804 |     "minibatch_size": 32,
805 |     "batch_accumulator": "mean",
806 |     "update_interval": 1,
807 |     "target_update_interval": 100
808 | }
809 | 
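# A hedged sketch of how these settings would typically be wired into ChainerRL
# objects; the exact construction lives in RlAgent, defined earlier in this
# module, and `q_function` and `explorer` below are placeholders:
#
#   import chainer
#   import chainerrl
#
#   optimizer = chainer.optimizers.Adam(eps=PARAM_DICT["adam_epsilon"])
#   replay_buffer = chainerrl.replay_buffer.PrioritizedReplayBuffer(
#       capacity=PARAM_DICT["replay_buffer_capacity"])
#   agent = chainerrl.agents.DQN(q_function, optimizer, replay_buffer,
#                                gamma=PARAM_DICT["dqn_gamma"],
#                                explorer=explorer,
#                                replay_start_size=PARAM_DICT["replay_start_size"],
#                                minibatch_size=PARAM_DICT["minibatch_size"],
#                                update_interval=PARAM_DICT["update_interval"],
#                                target_update_interval=PARAM_DICT["target_update_interval"],
#                                batch_accumulator=PARAM_DICT["batch_accumulator"])
#
# The epsilon_greedy_* and boltzmann_temperature entries presumably configure the
# alternative explorers and are ignored when "explorer" selects NOISY_NETS.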
--------------------------------------------------------------------------------