├── .gitignore ├── LICENSE.md ├── README.md ├── __init__.py ├── dataset_tools ├── __init.py__.py ├── collect_infected_ips.py ├── discard_unuseful_datasets.py ├── download_datasets.py ├── infected_ips.json ├── label_mcfp_datasets.py ├── label_normal_datasets.py └── normal_ips.json ├── example_config.py ├── features_extraction ├── CertificateFeatures.py ├── ComputeFeatures.py ├── Connection4tuple.py ├── ConnectionFeatures.py ├── DNSConnection.py ├── DNSFeatures.py ├── DatasetInformation.py ├── ExtractFeatures.py ├── MainBro.py ├── __init.py__.py └── top_level_domain ├── logger.py ├── machine_learning ├── Get_normalize_data.py ├── __init.py__.py ├── features_selection.py ├── model.py ├── normalize_and_split.py └── train.py ├── main_tools.py ├── statistics ├── __init.py__.py ├── datasets_statistics.py ├── dns_features_statistics.py ├── dns_features_stats.ipynb └── models_stats.ipynb └── tools ├── __init.py__.py ├── backup_results.py ├── check_IP.py ├── delete_results.py ├── download_datasets_gdrive.sh ├── entropy.py ├── extract_bro_ciphers.py ├── generate_features_table.py ├── generate_results_table.py ├── split_alexa.py ├── timeFunction.py └── tls_finger.bro /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # PyCharm 109 | .idea 110 | 111 | # Config file 112 | config.py 113 | .DS_Store 114 | /results/graphs/ 115 | /results/logs/ 116 | /results/model/ 117 | /results/features/ 118 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright for portions of project BotnetDetectionThesis are held by František Střasák 2018 as part of project HTTPSDetector (https://github.com/frenky-strasak/HTTPSDetector). All other copyright for project BotnetDetectionThesis are held by lminy 2018. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BotnetDetectorThesis 2 | 3 | This implementation was realized for my master thesis on "Botnet detection in encrypted traffic - a machine learning approach" 4 | 5 | ## Configuration 6 | The configuration has to be done in config.py file. A template is provided in example_config.py 7 | 8 | 9 | ## Run 10 | 11 | Follow these steps: 12 | 1. run features_extraction/MainBro.py to extract the features in results/features.csv 13 | 2. run machine_learning/normalize_and_split.py to generate data to feed to ML 14 | 3. 
run train.py to generate models
15 | 
16 | ## Choosing the set of features to train
17 | 
18 | Pass the setname of the features to use through
19 | ```Python
20 | Get_normalize_data.get_all_data("model_folder", "set_name")
21 | ```
22 | setname can take the value "all", "dns", "https", "reduced", "reduced_30", "reduced_40" and "enhanced_30".
23 | To create a new set of features, just complete the *features_set* dictionary present in the *get_all_data(...)* function
24 | 
25 | ## Generate the enhanced features set
26 | The enhanced features set contains cipher suites from ClientHello packets.
27 | Unfortunately the information is not available by default in Bro logs.
28 | Therefore it is required to extract them by hand. The tls_finger.bro script from [securityartwork.es](https://www.securityartwork.es/2017/02/02/tls-client-fingerprinting-with-bro/) has been used in order to do this extraction
29 | Moreover, to avoid re-computing the whole features set (which is time- and resource-consuming),
30 | the features are calculated separately then added to the csv with all features.
31 | 
32 | Here are the steps to generate the enhanced features set:
33 | 
34 | 1. Install [Bro](https://www.bro.org/download/index.html) or install [SecurityOnion](https://securityonion.net/) and put the **tls_finger.bro** file into the folder **"/usr/local/share/bro/site"**
35 | 2. Use **extract_bro_ciphers.py** to extract cipher suites from Bro logs
36 | 3. 
Use **feature_extraction/compute_ciphersuites_features.ipynb** to compute the features from Bro logs and store them in **results/model/features_enhanced.csv** 37 | 38 | 39 | ## Project structure 40 | - **dataset_tools/** -> contains all the tools related to the datasets (download, collect infected IPs, label and discard datasets) 41 | - **download_datasets.py**: to download the desired datasets 42 | - **discard_unuseful_datasets.py**: to discard datasets that have no flows labelled 43 | - **collect_infected_ips.py**: to collect infected and normal IPs from README.html files present in the dataset folders (uses a regex to parse the files) 44 | - **label_normal_datasets.py**: to label normal datasets 45 | - **label_mcfp_datasets.py**: to label MFCP datasets (excluding the "CTU-13 Dataset" which is already labelled) 46 | - **features_extraction/** -> contains the scripts that extract the features. Credits go to [Frantisek Strasak](https://github.com/frenky-strasak) for HTTPS features extractions. 
47 | - **machine_learning/** -> contains the scripts to normalize the data from the features extracted and train the model 48 | - **results/{graphs|logs|model}** -> default folders for generated graphs, models and logs 49 | - **results_backup/** -> contains the backup results of the different experiments 50 | - **statistics/** -> contains the scripts to analyze the features extracted and the models generated 51 | - **tools/** -> Various tools: 52 | - **tls_finger.bro**: Bro script to extract cipher suites 53 | - **extract_bro_ciphers.py**: Python script to extract logs + cipher suites from pcap's 54 | - **backup_results.py**: to backup the result folder (requires "results_folder_backup" to be set in config file) 55 | - **delete_results.py**: to delete the result folder 56 | - **split_alexa.py**: to sort and split alexa top websites in multiple files for quicker lookups 57 | 58 | ## Main requirements 59 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 60 | - [Jupyter notebook](https://jupyter.org/install) 61 | - [Numpy](http://www.numpy.org/) 62 | - [SciPy](https://www.scipy.org/install.html) 63 | - [sklearn](http://scikit-learn.org/stable/install.html) 64 | - [XGBoost](https://github.com/dmlc/xgboost/tree/master/python-package) 65 | 66 | 67 | ## License 68 | BotnetDetectorThesis is released under the MIT license. Credits go to František Střasák for some parts of the code (https://github.com/frenky-strasak/HTTPSDetector). 
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/__init__.py -------------------------------------------------------------------------------- /dataset_tools/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/dataset_tools/__init.py__.py -------------------------------------------------------------------------------- /dataset_tools/collect_infected_ips.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import re 5 | import config as c 6 | 7 | 8 | def run(cmd): 9 | import subprocess 10 | return subprocess.check_output(cmd) 11 | 12 | infected_ips = dict() 13 | normal_ips = dict() 14 | 15 | 16 | def print_ips(dataset_name=None): 17 | if dataset_name is None: 18 | print "Infected IPs : " + json.dumps(infected_ips) 19 | print "Normal IPs : " + json.dumps(normal_ips) 20 | else: 21 | print "Infected IPs : " + json.dumps(infected_ips[dataset_name]) 22 | print "Normal IPs : " + json.dumps(normal_ips[dataset_name]) 23 | 24 | 25 | # Loads json files 26 | if os.path.exists("./infected_ips.json") and os.path.exists("./normal_ips.json"): 27 | with open('./infected_ips.json', 'r') as f: 28 | infected_ips = json.load(f) 29 | 30 | with open('./normal_ips.json', 'r') as f: 31 | normal_ips = json.load(f) 32 | 33 | print_ips() 34 | 35 | 36 | infected_ips_collected_by_hand = { 37 | "CTU-Malware-Capture-Botnet-25-1":["10.0.2.106"], 38 | "CTU-Malware-Capture-Botnet-25-2":["10.0.2.103"], 39 | "CTU-Malware-Capture-Botnet-25-3":["10.0.2.103"], 40 | "CTU-Malware-Capture-Botnet-25-4":["10.0.2.103"], 41 | 
"CTU-Malware-Capture-Botnet-25-5":["10.0.2.103"], 42 | "CTU-Malware-Capture-Botnet-25-6":["10.0.2.103"], 43 | "CTU-Malware-Capture-Botnet-31-1":["10.0.2.110"], 44 | "CTU-Malware-Capture-Botnet-69":["10.0.2.117"], 45 | "CTU-Malware-Capture-Botnet-78-2":["10.0.2.108"], 46 | "CTU-Malware-Capture-Botnet-78-1":["10.0.2.108"], 47 | "CTU-Malware-Capture-Botnet-83-1":["10.0.2.102"], 48 | "CTU-Malware-Capture-Botnet-83-2":["10.0.2.102"], 49 | "CTU-Malware-Capture-Botnet-90":["192.168.3.104"], 50 | "CTU-Malware-Capture-Botnet-261-4":['192.168.1.'+str(i) for i in range(0,256)], 51 | "CTU-Malware-Capture-Botnet-301-1":['192.168.1.'+str(i) for i in range(0,256)], 52 | "CTU-Malware-Capture-Botnet-321-1":['192.168.1.'+str(i) for i in range(0,256)], 53 | 54 | } 55 | 56 | infected_ips.update(infected_ips_collected_by_hand) 57 | 58 | with open('./infected_ips.json', 'w') as f: 59 | f.write(json.dumps(infected_ips)) 60 | 61 | 62 | normal_ips_collected_by_hand = { 63 | "CTU-Malware-Capture-Botnet-25-1":[""], 64 | "CTU-Malware-Capture-Botnet-25-2":[""], 65 | "CTU-Malware-Capture-Botnet-25-3":[""], 66 | "CTU-Malware-Capture-Botnet-25-4":[""], 67 | "CTU-Malware-Capture-Botnet-25-5":[""], 68 | "CTU-Malware-Capture-Botnet-25-6":[""], 69 | "CTU-Malware-Capture-Botnet-31-1":[""], 70 | "CTU-Malware-Capture-Botnet-69":[""], 71 | "CTU-Malware-Capture-Botnet-78-1":[""], 72 | "CTU-Malware-Capture-Botnet-78-2":[""], 73 | "CTU-Malware-Capture-Botnet-83-1":[""], 74 | "CTU-Malware-Capture-Botnet-83-2":[""], 75 | "CTU-Malware-Capture-Botnet-90":[""], 76 | "CTU-Malware-Capture-Botnet-261-4":[""], 77 | "CTU-Malware-Capture-Botnet-301-1":[""], 78 | "CTU-Malware-Capture-Botnet-321-1":[""], 79 | } 80 | 81 | normal_ips.update(normal_ips_collected_by_hand) 82 | 83 | with open('./normal_ips.json', 'w') as f: 84 | f.write(json.dumps(normal_ips)) 85 | 86 | 87 | index = 0 88 | for sub_set in os.listdir(c.datasets_folder_general): 89 | if sub_set.startswith(".") or not os.path.exists(c.datasets_folder_general + 
sub_set + '/bro/ssl.log'): 90 | continue 91 | 92 | dataset_folder = c.datasets_folder_general + sub_set 93 | 94 | index += 1 95 | 96 | dataset_number = int(sub_set.split('-')[4]) 97 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") and (dataset_number <= 42 or dataset_number >= 54): 98 | print "========================================================" 99 | print "======== #" + str(index) + " " + sub_set 100 | print "========================================================" 101 | if sub_set in infected_ips: 102 | print "Already checked! :)" 103 | print_ips(sub_set) 104 | continue 105 | 106 | #print os.listdir(dataset_folder) 107 | for filename in os.listdir(dataset_folder): 108 | if "README.html" in filename: 109 | ips = list() 110 | 111 | with open(dataset_folder + "/" + filename) as f: 112 | for line in f: 113 | matchObj = re.match('.*Infected host: (\d+\.\d+\.\d+\.\d+).*', line) 114 | 115 | if matchObj: 116 | ips.append(matchObj.group(1)) 117 | 118 | if len(ips) > 0: 119 | print "IPs Found : " + str(ips) 120 | infected_ips[sub_set] = ips 121 | with open('./infected_ips.json', 'w') as f: 122 | f.write(json.dumps(infected_ips)) 123 | normal_ips[sub_set] = [""] 124 | with open('./normal_ips.json', 'w') as f: 125 | f.write(json.dumps(normal_ips)) 126 | else: 127 | print "No match!!" 
128 | print "------------------------------------" 129 | print "---------- Infected hosts" 130 | #print run(["grep", "-i", "-C", "3", "Infected", dataset_folder + "/" + filename]) 131 | ips = str(raw_input()) 132 | infected_ips[sub_set] = ips.split(",") 133 | with open('./infected_ips.json', 'w') as f: 134 | f.write(json.dumps(infected_ips)) 135 | 136 | print "------------------------------------" 137 | print "---------- Normal hosts" 138 | #print run(["grep", "-i", "-C", "3", 'Normal', dataset_folder + "/" + filename]) 139 | ips = str(raw_input()) 140 | normal_ips[sub_set] = ips.split(",") 141 | with open('./normal_ips.json', 'w') as f: 142 | f.write(json.dumps(normal_ips)) 143 | break 144 | 145 | 146 | -------------------------------------------------------------------------------- /dataset_tools/discard_unuseful_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import json 4 | import config as c 5 | 6 | with open('./infected_ips.json', 'r') as f: 7 | infected_ips = json.load(f) 8 | 9 | with open('./normal_ips.json', 'r') as f: 10 | normal_ips = json.load(f) 11 | 12 | index = 0 13 | for sub_set in os.listdir(c.datasets_folder_general): 14 | if sub_set.startswith(".") or not os.path.exists(datasets_folder + sub_set + '/bro/ssl.log'): 15 | continue 16 | 17 | dataset_folder = c.datasets_folder_general + sub_set 18 | 19 | index += 1 20 | 21 | dataset_number = int(sub_set.split('-')[4]) 22 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") and (dataset_number <= 42 or dataset_number >= 54): 23 | print("========================================================") 24 | print("======== #" + str(index) + " " + sub_set) 25 | print("========================================================") 26 | if len(infected_ips[sub_set][0]) == 0 and \ 27 | len(normal_ips[sub_set][0]) == 0: 28 | print("Moving dataset {} ({}) to {}".format(sub_set, dataset_folder, folder_other_datasets)) 29 | 
shutil.move(dataset_folder, c.datasets_discarded_folder) -------------------------------------------------------------------------------- /dataset_tools/download_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download all datasets which have bro folder. 3 | USAGE: 4 | python download_datasets.py https://mcfp.felk.cvut.cz/publicDatasets/ 5 | """ 6 | 7 | import sys 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import requests.packages.urllib3.exceptions 11 | requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) 12 | import urllib2 13 | import ssl 14 | import os 15 | import shutil 16 | import config as c 17 | import time 18 | import datetime 19 | from logger import get_logger 20 | 21 | 22 | import logging 23 | #import config as c 24 | 25 | 26 | logger = get_logger('debug') 27 | 28 | 29 | files_to_download = ["ssl.log", "x509.log", "weird.log", "conn.log", "dns.log"] 30 | 31 | # Normal datasets 32 | datasets_to_download = [ 33 | 'CTU-Normal-20/', 34 | 'CTU-Normal-21/', 35 | 'CTU-Normal-22/', 36 | 'CTU-Normal-23/', 37 | 'CTU-Normal-24/', 38 | 'CTU-Normal-25/', 39 | 'CTU-Normal-26/', 40 | 'CTU-Normal-27/', 41 | 'CTU-Normal-28/', 42 | 'CTU-Normal-29/', 43 | 'CTU-Normal-30/', 44 | 'CTU-Normal-31/', 45 | 'CTU-Normal-32/' 46 | ] 47 | 48 | # THE CTU-13 DATASET 49 | datasets_to_download = [ 50 | 'CTU-Malware-Capture-Botnet-42/', 51 | 'CTU-Malware-Capture-Botnet-43/', 52 | 'CTU-Malware-Capture-Botnet-44/', 53 | 'CTU-Malware-Capture-Botnet-45/', 54 | 'CTU-Malware-Capture-Botnet-46/', 55 | 'CTU-Malware-Capture-Botnet-47/', 56 | 'CTU-Malware-Capture-Botnet-48/', 57 | 'CTU-Malware-Capture-Botnet-49/', 58 | 'CTU-Malware-Capture-Botnet-50/', 59 | 'CTU-Malware-Capture-Botnet-51/', 60 | 'CTU-Malware-Capture-Botnet-52/', 61 | 'CTU-Malware-Capture-Botnet-53/', 62 | 'CTU-Malware-Capture-Botnet-54/', 63 | ] 64 | 65 | # Whole datasets 66 | datasets_to_download = 
["CTU-Malware-Capture-Botnet-1", "CTU-Malware-Capture-Botnet-102", "CTU-Malware-Capture-Botnet-111-1", "CTU-Malware-Capture-Botnet-116-1", "CTU-Malware-Capture-Botnet-116-2", "CTU-Malware-Capture-Botnet-138-1", "CTU-Malware-Capture-Botnet-157-1", "CTU-Malware-Capture-Botnet-163-1", "CTU-Malware-Capture-Botnet-164-1", "CTU-Malware-Capture-Botnet-169-1", "CTU-Malware-Capture-Botnet-169-2", "CTU-Malware-Capture-Botnet-169-3", "CTU-Malware-Capture-Botnet-17-1", "CTU-Malware-Capture-Botnet-17-2", "CTU-Malware-Capture-Botnet-174-1", "CTU-Malware-Capture-Botnet-175-1", "CTU-Malware-Capture-Botnet-177-1", "CTU-Malware-Capture-Botnet-178-1", "CTU-Malware-Capture-Botnet-179-1", "CTU-Malware-Capture-Botnet-180-1", "CTU-Malware-Capture-Botnet-181-1", "CTU-Malware-Capture-Botnet-183-1", "CTU-Malware-Capture-Botnet-184-1", "CTU-Malware-Capture-Botnet-185-1", "CTU-Malware-Capture-Botnet-186-1", "CTU-Malware-Capture-Botnet-187-1", "CTU-Malware-Capture-Botnet-188-1", "CTU-Malware-Capture-Botnet-188-2", "CTU-Malware-Capture-Botnet-188-3", "CTU-Malware-Capture-Botnet-188-4", "CTU-Malware-Capture-Botnet-189-1", "CTU-Malware-Capture-Botnet-189-2", "CTU-Malware-Capture-Botnet-193-1", "CTU-Malware-Capture-Botnet-193-2", "CTU-Malware-Capture-Botnet-194-1", "CTU-Malware-Capture-Botnet-195-1", "CTU-Malware-Capture-Botnet-196-1", "CTU-Malware-Capture-Botnet-198-1", "CTU-Malware-Capture-Botnet-199-1", "CTU-Malware-Capture-Botnet-199-2", "CTU-Malware-Capture-Botnet-200-1", "CTU-Malware-Capture-Botnet-201-1", "CTU-Malware-Capture-Botnet-202-1", "CTU-Malware-Capture-Botnet-203-1", "CTU-Malware-Capture-Botnet-204-1", "CTU-Malware-Capture-Botnet-205-1", "CTU-Malware-Capture-Botnet-205-2", "CTU-Malware-Capture-Botnet-208-2", "CTU-Malware-Capture-Botnet-209-1", "CTU-Malware-Capture-Botnet-210-1", "CTU-Malware-Capture-Botnet-211-1", "CTU-Malware-Capture-Botnet-211-2", "CTU-Malware-Capture-Botnet-213-1", "CTU-Malware-Capture-Botnet-215-1", "CTU-Malware-Capture-Botnet-215-2", 
"CTU-Malware-Capture-Botnet-217-1", "CTU-Malware-Capture-Botnet-218-1", "CTU-Malware-Capture-Botnet-219-1", "CTU-Malware-Capture-Botnet-219-2", "CTU-Malware-Capture-Botnet-219-3", "CTU-Malware-Capture-Botnet-220-1", "CTU-Malware-Capture-Botnet-221-1", "CTU-Malware-Capture-Botnet-221-2", "CTU-Malware-Capture-Botnet-222-1", "CTU-Malware-Capture-Botnet-224-1", "CTU-Malware-Capture-Botnet-227-1", "CTU-Malware-Capture-Botnet-228-1", "CTU-Malware-Capture-Botnet-230-1", "CTU-Malware-Capture-Botnet-230-2", "CTU-Malware-Capture-Botnet-231-1", "CTU-Malware-Capture-Botnet-232-1", "CTU-Malware-Capture-Botnet-235-1", "CTU-Malware-Capture-Botnet-237-1", "CTU-Malware-Capture-Botnet-238-1", "CTU-Malware-Capture-Botnet-239-1", "CTU-Malware-Capture-Botnet-240-1", "CTU-Malware-Capture-Botnet-241-1", "CTU-Malware-Capture-Botnet-242-1", "CTU-Malware-Capture-Botnet-243-1", "CTU-Malware-Capture-Botnet-244-1", "CTU-Malware-Capture-Botnet-245-1", "CTU-Malware-Capture-Botnet-246-1", "CTU-Malware-Capture-Botnet-247-1", "CTU-Malware-Capture-Botnet-248-1", "CTU-Malware-Capture-Botnet-249-1", "CTU-Malware-Capture-Botnet-25-1", "CTU-Malware-Capture-Botnet-25-2", "CTU-Malware-Capture-Botnet-25-3", "CTU-Malware-Capture-Botnet-25-4", "CTU-Malware-Capture-Botnet-25-5", "CTU-Malware-Capture-Botnet-25-6", "CTU-Malware-Capture-Botnet-251-1", "CTU-Malware-Capture-Botnet-253-1", "CTU-Malware-Capture-Botnet-254-1", "CTU-Malware-Capture-Botnet-257-1", "CTU-Malware-Capture-Botnet-260-1", "CTU-Malware-Capture-Botnet-261-1", "CTU-Malware-Capture-Botnet-261-2", "CTU-Malware-Capture-Botnet-261-3", "CTU-Malware-Capture-Botnet-261-4", "CTU-Malware-Capture-Botnet-263-1", "CTU-Malware-Capture-Botnet-264-1", "CTU-Malware-Capture-Botnet-265-1", "CTU-Malware-Capture-Botnet-266-1", "CTU-Malware-Capture-Botnet-267-1", "CTU-Malware-Capture-Botnet-270-1", "CTU-Malware-Capture-Botnet-273-1", "CTU-Malware-Capture-Botnet-274-1", "CTU-Malware-Capture-Botnet-275-1", "CTU-Malware-Capture-Botnet-277-1", 
"CTU-Malware-Capture-Botnet-278-1", "CTU-Malware-Capture-Botnet-279-1", "CTU-Malware-Capture-Botnet-280-1", "CTU-Malware-Capture-Botnet-281-1", "CTU-Malware-Capture-Botnet-282-1", "CTU-Malware-Capture-Botnet-285-1", "CTU-Malware-Capture-Botnet-287-1", "CTU-Malware-Capture-Botnet-290-1", "CTU-Malware-Capture-Botnet-291-1", "CTU-Malware-Capture-Botnet-292-1", "CTU-Malware-Capture-Botnet-293-1", "CTU-Malware-Capture-Botnet-294-1", "CTU-Malware-Capture-Botnet-295-1", "CTU-Malware-Capture-Botnet-296-1", "CTU-Malware-Capture-Botnet-297-1", "CTU-Malware-Capture-Botnet-299-1", "CTU-Malware-Capture-Botnet-300-1", "CTU-Malware-Capture-Botnet-301-1", "CTU-Malware-Capture-Botnet-302-1", "CTU-Malware-Capture-Botnet-303-1", "CTU-Malware-Capture-Botnet-305-1", "CTU-Malware-Capture-Botnet-305-2", "CTU-Malware-Capture-Botnet-306-1", "CTU-Malware-Capture-Botnet-308-1", "CTU-Malware-Capture-Botnet-31-1", "CTU-Malware-Capture-Botnet-315-1", "CTU-Malware-Capture-Botnet-318-1", "CTU-Malware-Capture-Botnet-320-1", "CTU-Malware-Capture-Botnet-320-2", "CTU-Malware-Capture-Botnet-321-1", "CTU-Malware-Capture-Botnet-322-1", "CTU-Malware-Capture-Botnet-323-1", "CTU-Malware-Capture-Botnet-324-1", "CTU-Malware-Capture-Botnet-325-1", "CTU-Malware-Capture-Botnet-326-1", "CTU-Malware-Capture-Botnet-327-1", "CTU-Malware-Capture-Botnet-327-2", "CTU-Malware-Capture-Botnet-328-1", "CTU-Malware-Capture-Botnet-329-1", "CTU-Malware-Capture-Botnet-334-1", "CTU-Malware-Capture-Botnet-335-1", "CTU-Malware-Capture-Botnet-336-1", "CTU-Malware-Capture-Botnet-339-1", "CTU-Malware-Capture-Botnet-340-1", "CTU-Malware-Capture-Botnet-341-1", "CTU-Malware-Capture-Botnet-344-1", "CTU-Malware-Capture-Botnet-345-1", "CTU-Malware-Capture-Botnet-346-1", "CTU-Malware-Capture-Botnet-348-1", "CTU-Malware-Capture-Botnet-349-1", "CTU-Malware-Capture-Botnet-350-1", "CTU-Malware-Capture-Botnet-352-1", "CTU-Malware-Capture-Botnet-354-1", "CTU-Malware-Capture-Botnet-42", "CTU-Malware-Capture-Botnet-43", 
"CTU-Malware-Capture-Botnet-44", "CTU-Malware-Capture-Botnet-45", "CTU-Malware-Capture-Botnet-46", "CTU-Malware-Capture-Botnet-47", "CTU-Malware-Capture-Botnet-48", "CTU-Malware-Capture-Botnet-49", "CTU-Malware-Capture-Botnet-50", "CTU-Malware-Capture-Botnet-51", "CTU-Malware-Capture-Botnet-52", "CTU-Malware-Capture-Botnet-53", "CTU-Malware-Capture-Botnet-54", "CTU-Malware-Capture-Botnet-69", "CTU-Malware-Capture-Botnet-78-1", "CTU-Malware-Capture-Botnet-78-2", "CTU-Malware-Capture-Botnet-83-1", "CTU-Malware-Capture-Botnet-83-2", "CTU-Malware-Capture-Botnet-90", "CTU-Normal-12", "CTU-Normal-20", "CTU-Normal-21", "CTU-Normal-22", "CTU-Normal-23", "CTU-Normal-24", "CTU-Normal-25", "CTU-Normal-26", "CTU-Normal-27" "CTU-Normal-28", "CTU-Normal-29", "CTU-Normal-30", "CTU-Normal-31", "CTU-Normal-32", "CTU-Normal-6-filtered", "CTU-Normal-7", "CTU-Normal-8-1", "CTU-Normal-8-2", "CTU-Normal-9"] 67 | 68 | 69 | def find_files(url): 70 | soup = BeautifulSoup(requests.get(url, verify=False).text, "lxml") 71 | hrefs = [] 72 | for a in soup.find_all('a'): 73 | if 'href' in a.attrs : 74 | hrefs.append(a['href']) 75 | return hrefs 76 | 77 | 78 | def compute_datasets_size(url): 79 | dataset_names = find_files(url) 80 | file_sizes = 0 81 | for i in range(len(dataset_names)): 82 | if dataset_names[i].replace("/", "") in datasets_to_download: 83 | #if 'CTU-Malware-Capture-Botnet-' in dataset_names[i] or 'CTU-Normal-' in dataset_names[i]: 84 | #number_name = int(dataset_names[i].split('-')[4].replace('/', '')) 85 | 86 | #if number_name < 248: 87 | # continue 88 | 89 | logger.info(url + dataset_names[i]) 90 | 91 | # Get content of the main page of dataset. 92 | content = find_files(url + dataset_names[i]) 93 | 94 | # Look into open folder to files there. There are binetflow, bro, ... 95 | # And find the bro folder in this list. 
96 | for j in range(len(content)): 97 | if 'bro' in content[j]: 98 | #print dataset_names[i] + content[j] 99 | file_sizes += save_manager(url, dataset_names[i]) 100 | break 101 | 102 | return file_sizes 103 | 104 | 105 | def save_manager(url, dataset_name): 106 | file_sizes = 0 107 | bro_files = find_files(url + dataset_name + 'bro/') 108 | 109 | if 'ssl.log' in bro_files: 110 | directory_name = c.datasets_folder_general + dataset_name 111 | #if os.path.exists(directory_name): 112 | # shutil.rmtree(directory_name) 113 | 114 | if not os.path.exists(directory_name): 115 | os.makedirs(directory_name) 116 | 117 | 118 | url_dataset = url + dataset_name 119 | for filename in find_files(url_dataset): 120 | # Download Readme file 121 | if "README" in filename and not os.path.exists(directory_name + filename): 122 | save_file(url_dataset + filename, directory_name + filename) 123 | # Download pcap file 124 | if filename.endswith(".pcap") and not os.path.exists(directory_name + filename): 125 | save_file(url_dataset + filename, directory_name + filename) 126 | #url_file = url + dataset_name + "README.html" 127 | #file_name = directory_name + "README.html" 128 | 129 | 130 | 131 | 132 | 133 | folder_bro = directory_name + "bro/" 134 | if not os.path.exists(folder_bro): 135 | os.makedirs(folder_bro) 136 | 137 | for bro_log in bro_files: 138 | if bro_log.endswith('.log') and bro_log in files_to_download: 139 | if not os.path.exists(directory_name + "bro/" + bro_log): # If file does not exists on hdd 140 | logger.info(url + dataset_name) 141 | url_file = url + dataset_name + 'bro/' + bro_log 142 | file_sizes += save_file(url_file, folder_bro + bro_log) 143 | 144 | return file_sizes 145 | 146 | 147 | def save_file(url_file, file_name): 148 | logger.info(url_file + " is downloading...") 149 | file_size = 0 150 | # https://stackoverflow.com/a/28052583 151 | req = urllib2.Request(url, headers={ 'X-Mashape-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' }) 152 | gcontext = 
ssl.SSLContext(ssl.PROTOCOL_TLSv1) 153 | u = urllib2.urlopen(url_file, context=gcontext) 154 | meta = u.info() 155 | file_size += int(meta.getheaders("Content-Length")[0]) 156 | 157 | f = open(file_name, 'wb') 158 | #logger.info("Downloading: %s Bytes: %s" % (file_name, file_size)) 159 | 160 | file_size_dl = 0 161 | block_sz = 8192 162 | while True: 163 | buffer = u.read(block_sz) 164 | if not buffer: 165 | break 166 | 167 | file_size_dl += len(buffer) 168 | f.write(buffer) 169 | status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size) 170 | status = status + chr(8) * (len(status) + 1) 171 | #logger.info(status) 172 | 173 | f.close() 174 | return file_size 175 | 176 | 177 | if __name__ == '__main__': 178 | start_time = time.time() 179 | datasets_size = 0 180 | if len(sys.argv) == 2: 181 | url = sys.argv[1] 182 | datasets_size += compute_datasets_size(url) 183 | # find_files(url+'CTU-Malware-Capture-Botnet-31/') 184 | else: 185 | logger.error("Error: Please put argument.") 186 | logger.info("Complet Dataset size:" + str(datasets_size / (1024.0 * 1024.0)) + "MB") 187 | total_time = datetime.timedelta(seconds=time.time() - start_time) 188 | logger.info("Time : " + str(total_time)) # .strftime('%H:%M:%S')) 189 | 190 | -------------------------------------------------------------------------------- /dataset_tools/infected_ips.json: -------------------------------------------------------------------------------- 1 | {"CTU-Malware-Capture-Botnet-116-4": [""], "CTU-Malware-Capture-Botnet-336-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-116-2": ["192.168.0.250,192.168.0.251"], "CTU-Malware-Capture-Botnet-221-2": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-221-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-116-1": [""], "CTU-Malware-Capture-Botnet-117-1": [""], "CTU-Malware-Capture-Botnet-322-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-335-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-227-1": ["192.168.1.116"], 
"CTU-Malware-Capture-Botnet-208-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-321-1": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", "192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", "192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", 
"192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", "192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", "192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", 
"192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-111-1": ["10.0.2.110"], "CTU-Malware-Capture-Botnet-112-4": [""], "CTU-Malware-Capture-Botnet-78-2": ["10.0.2.108"], "CTU-Malware-Capture-Botnet-266-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-205-2": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-112-2": [""], "CTU-Malware-Capture-Botnet-112-1": [""], "CTU-Malware-Capture-Botnet-257-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-203-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-261-3": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-261-2": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-261-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-163-1": ["10.0.2.106"], "CTU-Malware-Capture-Botnet-339-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-261-4": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", 
"192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", "192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", "192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", 
"192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", "192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", "192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-145-1": [""], "CTU-Malware-Capture-Botnet-17-2": ["10.0.2.119"], "CTU-Malware-Capture-Botnet-17-1": ["10.0.2.118"], "CTU-Malware-Capture-Botnet-123-1": [""], "CTU-Malware-Capture-Botnet-341-1": ["192.168.1.134"], "CTU-Malware-Capture-Botnet-164-1": ["10.0.2.200"], "CTU-Malware-Capture-Botnet-110-1": [""], "CTU-Malware-Capture-Botnet-270-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-137-1": [""], "CTU-Malware-Capture-Botnet-169-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-169-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-169-3": ["192.168.1.114"], 
"CTU-Malware-Capture-Botnet-281-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-349-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-243-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-230-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-230-2": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-179-1": ["10.0.2.113"], "CTU-Malware-Capture-Botnet-282-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-350-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-244-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-177-1": ["10.0.2.11"], "CTU-Malware-Capture-Botnet-174-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-247-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-297-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-306-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-142-1": [""], "CTU-Malware-Capture-Botnet-239-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-141-2": [""], "CTU-Malware-Capture-Botnet-301-1": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", "192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", 
"192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", "192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", "192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", 
"192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", "192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-141-1": [""], "CTU-Malware-Capture-Botnet-69": ["10.0.2.117"], "CTU-Malware-Capture-Botnet-273-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-295-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-184-1": ["192.168.1.102"], "CTU-Malware-Capture-Botnet-138-1": ["54.242.92.108", "10.0.0.151", "10.0.0.152", "10.0.0.153"], "CTU-Malware-Capture-Botnet-279-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-25-4": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-5": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-6": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-249-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-25-1": ["10.0.2.106"], "CTU-Malware-Capture-Botnet-25-2": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-3": ["10.0.2.103"], 
"CTU-Malware-Capture-Botnet-189-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-189-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-292-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-352-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-31-1": ["10.0.2.110"], "CTU-Malware-Capture-Botnet-345-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-181-1": ["10.0.2.116"], "CTU-Malware-Capture-Botnet-183-1": ["192.168.1.102"], "CTU-Malware-Capture-Botnet-205-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-305-2": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-324-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-129-1": [""], "CTU-Malware-Capture-Botnet-264-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-90": ["192.168.3.104"], "CTU-Malware-Capture-Botnet-346-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-224-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-78-1": ["10.0.2.108"], "CTU-Malware-Capture-Botnet-318-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-199-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-194-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-199-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-274-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-315-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-219-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-219-3": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-219-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-277-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-294-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-291-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-327-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-327-2": ["192.168.113"], "CTU-Malware-Capture-Botnet-110-4": [""], "CTU-Malware-Capture-Botnet-110-5": [""], "CTU-Malware-Capture-Botnet-110-6": [""], "CTU-Malware-Capture-Botnet-220-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-293-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-110-2": [""], "CTU-Malware-Capture-Botnet-211-2": 
["192.168.1.112"], "CTU-Malware-Capture-Botnet-211-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-213-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-305-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-320-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-320-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-290-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-251-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-201-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-253-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-209-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-204-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-263-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-162-2": [""], "CTU-Malware-Capture-Botnet-162-1": [""], "CTU-Malware-Capture-Botnet-260-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-287-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-334-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-323-1": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-1": ["10.0.2.22", "10.0.2.112"], "CTU-Malware-Capture-Botnet-280-1": ["192.168.1.106"], "CTU-Malware-Capture-Botnet-202-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-200-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-120-1": [""], "CTU-Malware-Capture-Botnet-144-1": [""], "CTU-Malware-Capture-Botnet-143-1": [""], "CTU-Malware-Capture-Botnet-242-1": ["192.168.1.220"], "CTU-Malware-Capture-Botnet-240-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-232-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-178-1": ["10.0.2.112"], "CTU-Malware-Capture-Botnet-231-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-175-1": ["10.0.2.109"], "CTU-Malware-Capture-Botnet-340-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-302-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-245-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-248-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-299-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-246-1": ["192.168.1.110"], 
"CTU-Malware-Capture-Botnet-238-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-328-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-193-2": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-237-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-348-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-235-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-140-2": [""], "CTU-Malware-Capture-Botnet-140-1": [""], "CTU-Malware-Capture-Botnet-196-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-157-1": ["222.179.116.23"], "CTU-Malware-Capture-Botnet-186-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-254-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-217-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-308-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-241-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-329-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-102": ["10.0.2.102"], "CTU-Malware-Capture-Botnet-180-1": ["10.0.2.115"], "CTU-Malware-Capture-Botnet-185-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-300-1": ["192.168.1.106"], "CTU-Malware-Capture-Botnet-188-4": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-188-3": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-188-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-188-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-303-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-296-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-278-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-275-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-325-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-267-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-326-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-153-1": [""], "CTU-Malware-Capture-Botnet-198-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-354-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-344-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-83-1": ["10.0.2.102"], "CTU-Malware-Capture-Botnet-83-2": ["10.0.2.102"], 
"CTU-Malware-Capture-Botnet-228-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-195-1": [""], "CTU-Malware-Capture-Botnet-187-1": ["192.168.1.110"], "CTU-Malware-Capture-Botnet-215-2": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-222-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-193-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-215-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-317-1": [""], "CTU-Malware-Capture-Botnet-210-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-218-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-285-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-265-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-111-5": [""], "CTU-Malware-Capture-Botnet-333-1": [""]} -------------------------------------------------------------------------------- /dataset_tools/label_mcfp_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import config as c 3 | import json 4 | 5 | def check_conn_label(dataset_path, normal_ips, infected_ips): 6 | print "<< Labeling " + dataset_path 7 | flow_array = [] 8 | space = '\t' 9 | normal_label = 0 10 | malware_label = 0 11 | with open(dataset_path + '/bro/conn.log', 'r') as f: 12 | for line in f: 13 | newline = line 14 | 15 | if line[0] != '#': 16 | split = line.split('\t') 17 | src_address = split[2] 18 | 19 | if src_address in normal_ips: 20 | newline = line.rstrip() + space + "From-Normal" + "\n" 21 | normal_label += 1 22 | elif src_address in infected_ips: 23 | newline = line.rstrip() + space + "From-Botnet" + "\n" 24 | malware_label += 1 25 | else: 26 | if 'fields' in line: 27 | newline = line.rstrip() + space + "label" + "\n" 28 | elif 'types' in line: 29 | newline = line.rstrip() + space + "string" + "\n" 30 | 31 | flow_array.append(newline) 32 | 33 | if "#close" in line: 34 | break 35 | 36 | print "normals:", normal_label 37 | print "malwares:", malware_label 38 | print " << End Labeling " + dataset_path 39 | return 
flow_array 40 | 41 | def write_conn(path, flow_array): 42 | print "<< Writing new flows to " + path 43 | index = 0 44 | with open(path + '/bro/conn_label.log', 'w') as f: 45 | for i in range(len(flow_array)): 46 | f.write(flow_array[i]) 47 | index += 1 48 | print " << Number of lines:", index 49 | print "<< New file conn_label.log was succesfly created." 50 | 51 | if __name__ == '__main__': 52 | 53 | with open('./infected_ips.json', 'r') as f: 54 | infected_ips = json.load(f) 55 | 56 | with open('./normal_ips.json', 'r') as f: 57 | normal_ips = json.load(f) 58 | 59 | for sub_set in os.listdir(c.datasets_folder_general): 60 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") : 61 | dataset_number = int(sub_set.split('-')[4]) 62 | if (dataset_number <= 42 or dataset_number >= 54) \ 63 | and (sub_set in infected_ips or sub_set in normal_ips): 64 | flow_array = check_conn_label(c.datasets_folder_general + sub_set, normal_ips[sub_set], infected_ips[sub_set]) 65 | write_conn(c.datasets_folder_general + sub_set, flow_array) 66 | -------------------------------------------------------------------------------- /dataset_tools/label_normal_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import config as c 3 | 4 | 5 | def check_conn_label(dataset_path): 6 | print "<< Labeling " + dataset_path 7 | flow_array = [] 8 | space = ' ' 9 | normal_label = 0 10 | with open(dataset_path + '/bro/conn.log', 'r') as f: 11 | for line in f: 12 | newline = line 13 | if not ('#' == line[0]): 14 | newline = line.rstrip() + space + "From-Normal" + "\n" 15 | normal_label += 1 16 | else: 17 | if 'fields' in line: 18 | newline = line.rstrip() + space + "label" + "\n" 19 | elif 'types' in line: 20 | newline = line.rstrip() + space + "string" + "\n" 21 | 22 | flow_array.append(newline) 23 | 24 | print "normals:", normal_label 25 | print " << End Labeling " + dataset_path 26 | return flow_array 27 | 28 | def write_conn(path, 
flow_array): 29 | print "<< Writing new flows to " + path 30 | index = 0 31 | with open(path + '/bro/conn_label.log', 'w') as f: 32 | for i in range(len(flow_array)): 33 | f.write(flow_array[i]) 34 | index += 1 35 | print " << Number of lines:", index 36 | print "<< New file conn_label.log was succesfly created." 37 | 38 | if __name__ == '__main__': 39 | for sub_set in os.listdir(c.datasets_folder): 40 | if not sub_set.startswith(".") and sub_set.startswith("CTU-Normal"): 41 | flow_array = check_conn_label(c.datasets_folder + sub_set) 42 | write_conn(c.datasets_folder + sub_set, flow_array) 43 | -------------------------------------------------------------------------------- /dataset_tools/normal_ips.json: -------------------------------------------------------------------------------- 1 | {"CTU-Malware-Capture-Botnet-116-4": [""], "CTU-Malware-Capture-Botnet-336-1": [""], "CTU-Malware-Capture-Botnet-116-2": [""], "CTU-Malware-Capture-Botnet-221-2": [""], "CTU-Malware-Capture-Botnet-221-1": [""], "CTU-Malware-Capture-Botnet-116-1": ["46.105.227.94"], "CTU-Malware-Capture-Botnet-117-1": [""], "CTU-Malware-Capture-Botnet-322-1": [""], "CTU-Malware-Capture-Botnet-335-1": [""], "CTU-Malware-Capture-Botnet-227-1": [""], "CTU-Malware-Capture-Botnet-208-2": [""], "CTU-Malware-Capture-Botnet-321-1": [""], "CTU-Malware-Capture-Botnet-111-1": [""], "CTU-Malware-Capture-Botnet-112-4": [""], "CTU-Malware-Capture-Botnet-78-2": [""], "CTU-Malware-Capture-Botnet-266-1": [""], "CTU-Malware-Capture-Botnet-205-2": [""], "CTU-Malware-Capture-Botnet-112-2": [""], "CTU-Malware-Capture-Botnet-112-1": [""], "CTU-Malware-Capture-Botnet-257-1": [""], "CTU-Malware-Capture-Botnet-203-1": [""], "CTU-Malware-Capture-Botnet-261-3": [""], "CTU-Malware-Capture-Botnet-261-2": [""], "CTU-Malware-Capture-Botnet-261-1": [""], "CTU-Malware-Capture-Botnet-163-1": [""], "CTU-Malware-Capture-Botnet-339-1": [""], "CTU-Malware-Capture-Botnet-261-4": [""], "CTU-Malware-Capture-Botnet-145-1": [""], 
"CTU-Malware-Capture-Botnet-17-2": [""], "CTU-Malware-Capture-Botnet-17-1": [""], "CTU-Malware-Capture-Botnet-123-1": [""], "CTU-Malware-Capture-Botnet-341-1": [""], "CTU-Malware-Capture-Botnet-164-1": [""], "CTU-Malware-Capture-Botnet-110-1": [""], "CTU-Malware-Capture-Botnet-270-1": [""], "CTU-Malware-Capture-Botnet-137-1": [""], "CTU-Malware-Capture-Botnet-169-1": [""], "CTU-Malware-Capture-Botnet-169-2": [""], "CTU-Malware-Capture-Botnet-169-3": [""], "CTU-Malware-Capture-Botnet-281-1": [""], "CTU-Malware-Capture-Botnet-349-1": [""], "CTU-Malware-Capture-Botnet-243-1": [""], "CTU-Malware-Capture-Botnet-230-1": [""], "CTU-Malware-Capture-Botnet-230-2": [""], "CTU-Malware-Capture-Botnet-179-1": [""], "CTU-Malware-Capture-Botnet-282-1": [""], "CTU-Malware-Capture-Botnet-350-1": [""], "CTU-Malware-Capture-Botnet-244-1": [""], "CTU-Malware-Capture-Botnet-177-1": [""], "CTU-Malware-Capture-Botnet-174-1": [""], "CTU-Malware-Capture-Botnet-247-1": [""], "CTU-Malware-Capture-Botnet-297-1": [""], "CTU-Malware-Capture-Botnet-306-1": [""], "CTU-Malware-Capture-Botnet-142-1": [""], "CTU-Malware-Capture-Botnet-239-1": [""], "CTU-Malware-Capture-Botnet-141-2": [""], "CTU-Malware-Capture-Botnet-301-1": [""], "CTU-Malware-Capture-Botnet-141-1": [""], "CTU-Malware-Capture-Botnet-69": [""], "CTU-Malware-Capture-Botnet-273-1": [""], "CTU-Malware-Capture-Botnet-295-1": [""], "CTU-Malware-Capture-Botnet-184-1": [""], "CTU-Malware-Capture-Botnet-138-1": [""], "CTU-Malware-Capture-Botnet-279-1": [""], "CTU-Malware-Capture-Botnet-25-4": [""], "CTU-Malware-Capture-Botnet-25-5": [""], "CTU-Malware-Capture-Botnet-25-6": [""], "CTU-Malware-Capture-Botnet-249-1": [""], "CTU-Malware-Capture-Botnet-25-1": [""], "CTU-Malware-Capture-Botnet-25-2": [""], "CTU-Malware-Capture-Botnet-25-3": [""], "CTU-Malware-Capture-Botnet-189-2": [""], "CTU-Malware-Capture-Botnet-189-1": [""], "CTU-Malware-Capture-Botnet-292-1": [""], "CTU-Malware-Capture-Botnet-352-1": [""], "CTU-Malware-Capture-Botnet-31-1": 
[""], "CTU-Malware-Capture-Botnet-345-1": [""], "CTU-Malware-Capture-Botnet-181-1": [""], "CTU-Malware-Capture-Botnet-183-1": [""], "CTU-Malware-Capture-Botnet-205-1": [""], "CTU-Malware-Capture-Botnet-305-2": [""], "CTU-Malware-Capture-Botnet-324-1": [""], "CTU-Malware-Capture-Botnet-129-1": [""], "CTU-Malware-Capture-Botnet-264-1": [""], "CTU-Malware-Capture-Botnet-90": [""], "CTU-Malware-Capture-Botnet-346-1": [""], "CTU-Malware-Capture-Botnet-224-1": [""], "CTU-Malware-Capture-Botnet-78-1": [""], "CTU-Malware-Capture-Botnet-318-1": [""], "CTU-Malware-Capture-Botnet-199-1": [""], "CTU-Malware-Capture-Botnet-194-1": [""], "CTU-Malware-Capture-Botnet-199-2": [""], "CTU-Malware-Capture-Botnet-274-1": [""], "CTU-Malware-Capture-Botnet-315-1": [""], "CTU-Malware-Capture-Botnet-219-2": [""], "CTU-Malware-Capture-Botnet-219-3": [""], "CTU-Malware-Capture-Botnet-219-1": [""], "CTU-Malware-Capture-Botnet-277-1": [""], "CTU-Malware-Capture-Botnet-294-1": [""], "CTU-Malware-Capture-Botnet-291-1": [""], "CTU-Malware-Capture-Botnet-327-1": [""], "CTU-Malware-Capture-Botnet-327-2": [""], "CTU-Malware-Capture-Botnet-110-4": [""], "CTU-Malware-Capture-Botnet-110-5": [""], "CTU-Malware-Capture-Botnet-110-6": [""], "CTU-Malware-Capture-Botnet-220-1": [""], "CTU-Malware-Capture-Botnet-293-1": [""], "CTU-Malware-Capture-Botnet-110-2": [""], "CTU-Malware-Capture-Botnet-211-2": [""], "CTU-Malware-Capture-Botnet-211-1": [""], "CTU-Malware-Capture-Botnet-213-1": [""], "CTU-Malware-Capture-Botnet-305-1": [""], "CTU-Malware-Capture-Botnet-320-2": [""], "CTU-Malware-Capture-Botnet-320-1": [""], "CTU-Malware-Capture-Botnet-290-1": [""], "CTU-Malware-Capture-Botnet-251-1": [""], "CTU-Malware-Capture-Botnet-201-1": [""], "CTU-Malware-Capture-Botnet-253-1": [""], "CTU-Malware-Capture-Botnet-209-1": [""], "CTU-Malware-Capture-Botnet-204-1": [""], "CTU-Malware-Capture-Botnet-263-1": [""], "CTU-Malware-Capture-Botnet-162-2": [""], "CTU-Malware-Capture-Botnet-162-1": [""], 
"CTU-Malware-Capture-Botnet-260-1": [""], "CTU-Malware-Capture-Botnet-287-1": [""], "CTU-Malware-Capture-Botnet-334-1": [""], "CTU-Malware-Capture-Botnet-323-1": [""], "CTU-Malware-Capture-Botnet-238-1": [""], "CTU-Malware-Capture-Botnet-280-1": [""], "CTU-Malware-Capture-Botnet-202-1": [""], "CTU-Malware-Capture-Botnet-200-1": [""], "CTU-Malware-Capture-Botnet-120-1": [""], "CTU-Malware-Capture-Botnet-144-1": [""], "CTU-Malware-Capture-Botnet-143-1": [""], "CTU-Malware-Capture-Botnet-242-1": [""], "CTU-Malware-Capture-Botnet-240-1": [""], "CTU-Malware-Capture-Botnet-232-1": [""], "CTU-Malware-Capture-Botnet-178-1": [""], "CTU-Malware-Capture-Botnet-231-1": [""], "CTU-Malware-Capture-Botnet-175-1": [""], "CTU-Malware-Capture-Botnet-340-1": [""], "CTU-Malware-Capture-Botnet-302-1": [""], "CTU-Malware-Capture-Botnet-245-1": [""], "CTU-Malware-Capture-Botnet-248-1": [""], "CTU-Malware-Capture-Botnet-299-1": [""], "CTU-Malware-Capture-Botnet-246-1": [""], "CTU-Malware-Capture-Botnet-1": [""], "CTU-Malware-Capture-Botnet-328-1": [""], "CTU-Malware-Capture-Botnet-222-1": [""], "CTU-Malware-Capture-Botnet-237-1": [""], "CTU-Malware-Capture-Botnet-348-1": [""], "CTU-Malware-Capture-Botnet-235-1": [""], "CTU-Malware-Capture-Botnet-140-2": [""], "CTU-Malware-Capture-Botnet-140-1": [""], "CTU-Malware-Capture-Botnet-196-1": [""], "CTU-Malware-Capture-Botnet-157-1": [""], "CTU-Malware-Capture-Botnet-186-1": [""], "CTU-Malware-Capture-Botnet-254-1": [""], "CTU-Malware-Capture-Botnet-217-1": [""], "CTU-Malware-Capture-Botnet-308-1": [""], "CTU-Malware-Capture-Botnet-241-1": [""], "CTU-Malware-Capture-Botnet-329-1": [""], "CTU-Malware-Capture-Botnet-102": [""], "CTU-Malware-Capture-Botnet-180-1": [""], "CTU-Malware-Capture-Botnet-185-1": [""], "CTU-Malware-Capture-Botnet-300-1": [""], "CTU-Malware-Capture-Botnet-188-4": [""], "CTU-Malware-Capture-Botnet-188-3": [""], "CTU-Malware-Capture-Botnet-188-2": [""], "CTU-Malware-Capture-Botnet-188-1": [""], 
"CTU-Malware-Capture-Botnet-303-1": [""], "CTU-Malware-Capture-Botnet-296-1": [""], "CTU-Malware-Capture-Botnet-278-1": [""], "CTU-Malware-Capture-Botnet-275-1": [""], "CTU-Malware-Capture-Botnet-325-1": [""], "CTU-Malware-Capture-Botnet-267-1": [""], "CTU-Malware-Capture-Botnet-326-1": [""], "CTU-Malware-Capture-Botnet-153-1": [""], "CTU-Malware-Capture-Botnet-198-1": [""], "CTU-Malware-Capture-Botnet-354-1": [""], "CTU-Malware-Capture-Botnet-344-1": [""], "CTU-Malware-Capture-Botnet-83-1": [""], "CTU-Malware-Capture-Botnet-83-2": [""], "CTU-Malware-Capture-Botnet-228-1": [""], "CTU-Malware-Capture-Botnet-195-1": [""], "CTU-Malware-Capture-Botnet-187-1": [""], "CTU-Malware-Capture-Botnet-215-2": [""], "CTU-Malware-Capture-Botnet-193-2": [""], "CTU-Malware-Capture-Botnet-193-1": [""], "CTU-Malware-Capture-Botnet-215-1": [""], "CTU-Malware-Capture-Botnet-317-1": [""], "CTU-Malware-Capture-Botnet-210-1": [""], "CTU-Malware-Capture-Botnet-218-1": [""], "CTU-Malware-Capture-Botnet-285-1": [""], "CTU-Malware-Capture-Botnet-265-1": [""], "CTU-Malware-Capture-Botnet-111-5": [""], "CTU-Malware-Capture-Botnet-333-1":[""]} -------------------------------------------------------------------------------- /example_config.py: -------------------------------------------------------------------------------- 1 | # Only absolute paths 2 | # /!\ Don't forget "/" at the end for folders 3 | datasets_folder_general = "/Volumes/Data/datasets_general/folder/" 4 | datasets_folder = "absolute_path/to/my/datasets/folder/" 5 | datasets_discarded_folder = "absolute_path/to/my/discarded_datasets/folder/" 6 | 7 | results_folder = "absolute_path/to/my/results/folder/" 8 | results_folder_backup = "absolute_path/to/my/results_backup/folder/" 9 | model_folder = results_folder + "model/" 10 | logs_folder = results_folder + "logs/" 11 | graphs_folder = results_folder + "graphs/" 12 | 13 | alexa_folder = "absolute_path/to/my/alexa/folder/" 14 | top_level_domain_file = 
"absolute_path/to/my/top_level_domain/file" 15 | 16 | training_output_file = model_folder + "training_output_file.txt" -------------------------------------------------------------------------------- /features_extraction/CertificateFeatures.py: -------------------------------------------------------------------------------- 1 | 2 | class CertificateFeatures: 3 | 4 | def __init__(self, cert_serial, x509_line): 5 | self.servernames_dict = dict() 6 | self.cert_serial = cert_serial 7 | self.x509_line = x509_line 8 | self.malware_labels = 0 9 | self.normal_labels = 0 10 | 11 | self.not_valid_certificate_number = 0 12 | self.cert_percent_validity = [] 13 | self.is_CN_in_SAN_f = -1 14 | self.certificate_key_length = -1 15 | self.number_san_domains = 0 16 | self.number_x509_lines = 0 17 | 18 | self.process_certificate(x509_line) 19 | 20 | def process_certificate(self, x509_line): 21 | self.is_CN_in_SAN(x509_line) 22 | 23 | split = x509_line.split(' ') 24 | 25 | self.certificate_key_length = float(split[11]) 26 | 27 | # number of domain in san in x509 28 | if split[14] != '-': 29 | domains = len(split[14].split(',')) 30 | self.number_san_domains += domains 31 | 32 | def add_server_name(self, server_name, label): 33 | try: 34 | if self.servernames_dict[server_name]: 35 | pass 36 | except: 37 | self.servernames_dict[server_name] = 1 38 | 39 | if 'Botnet' in label: 40 | self.malware_labels += 1 41 | if 'Normal' in label: 42 | self.normal_labels += 1 43 | 44 | def contain_server_name(self, server_name): 45 | try: 46 | if self.servernames_dict[server_name]: 47 | return self.x509_line 48 | except: 49 | return 0 50 | 51 | def is_malware(self): 52 | if self.malware_labels != 0 and self.normal_labels != 0: 53 | print "Error: There are more malwares and more normals! 
Cert serial:", self.cert_serial 54 | print " " + "malwares:", self.malware_labels, "normals", self.normal_labels 55 | print " " + "SNI:" 56 | print self.servernames_dict.keys() 57 | 58 | if self.malware_labels > self.normal_labels: 59 | return True 60 | return False 61 | 62 | def add_x509_line(self, x509_line): 63 | split = x509_line.split(' ') 64 | 65 | if split[7] != '-' and split[6] != '-': 66 | try: 67 | current_time = float(split[0]) 68 | before_date = float(split[6]) 69 | after_date = float(split[7]) 70 | if current_time > after_date or current_time < before_date: 71 | self.not_valid_certificate_number += 1 72 | # print split[1], before_date, current_time, after_date 73 | 74 | # certificate ratio 75 | norm_after = after_date - before_date # 31622399 76 | current_time_norm = current_time - before_date # 12025263 77 | self.cert_percent_validity.append(current_time_norm / norm_after) 78 | 79 | self.number_x509_lines += 1 80 | except: 81 | print "Certificate time length is broken." 82 | 83 | 84 | def is_CN_in_SAN(self, x509_line): 85 | x509_split = x509_line.split(' ') 86 | if x509_split[14] != '-': 87 | CN_part = x509_split[4] 88 | SAN_dns_list = x509_split[14].split(',') 89 | for i in range(len(SAN_dns_list)): 90 | if '*' in SAN_dns_list[i]: 91 | SAN_dns_list[i] = SAN_dns_list[i].replace('*', '') 92 | hit_2 = 0 93 | for san_dns in SAN_dns_list: 94 | if san_dns in CN_part: 95 | hit_2 = 1 96 | break 97 | self.is_CN_in_SAN_f = hit_2 98 | 99 | def get_label_of_connection(self): 100 | if self.malware_labels > self.normal_labels: 101 | return "MALWARE" 102 | else: 103 | return "NORMAL" 104 | """ 105 | ------------- FEATERES --------------- 106 | """ 107 | # 1 CN is there 108 | # 0 is not there 109 | # -1 is not define 110 | def get_is_CN_in_SAN(self): 111 | return self.is_CN_in_SAN_f 112 | 113 | def get_certificate_key_length(self): 114 | return self.certificate_key_length 115 | 116 | def get_number_san_domains(self): 117 | return self.number_san_domains 118 | 119 | 
def get_number_of_server_name(self): 120 | return len(self.servernames_dict.keys()) 121 | 122 | def get_not_valid_certificate_number(self): 123 | if self.number_x509_lines != 0: 124 | return self.not_valid_certificate_number / float(self.number_x509_lines) 125 | return -1 126 | 127 | def get_certificate_ratio(self): 128 | if len(self.cert_percent_validity) != 0: 129 | temp = 0 130 | for i in self.cert_percent_validity: 131 | temp += i 132 | return temp / float(len(self.cert_percent_validity)) 133 | else: 134 | return -1 -------------------------------------------------------------------------------- /features_extraction/ComputeFeatures.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from ExtractFeatures import ExtractFeatures 3 | import config as c 4 | 5 | from logger import get_logger 6 | logger = get_logger("debug") 7 | 8 | class ComputeFeatures(ExtractFeatures): 9 | 10 | def __init__(self): 11 | super(ComputeFeatures, self).__init__() 12 | self.file_time_name = str(datetime.strftime(datetime.utcnow(), "%Y-%m-%d_%H-%M")) 13 | 14 | def add_cert_to_non_cert_conn(self): 15 | for key in self.connection_4_tuples.keys(): 16 | 17 | """ 18 | implementig feature: connection which have no certificate, but have at least one SNI, 19 | look, if in certificate_objects_dict is such servername with certificate 20 | """ 21 | break_v = 0 22 | if self.connection_4_tuples[key].get_amount_diff_certificates() == 0: 23 | 24 | server_names = self.connection_4_tuples[key].get_SNI_list() 25 | if len(server_names) != 0: 26 | for cert_serial in self.certificate_dict.keys(): 27 | for server_name in server_names: 28 | x509_line = self.certificate_dict[cert_serial].contain_server_name(server_name) 29 | if x509_line != 0: 30 | self.connection_4_tuples[key].add_ssl_log_2(x509_line) 31 | print "This Certificate was added after process:", "cert_serial:", cert_serial, "server_name=",server_name, "4-tuple=", key, "label:", 
self.connection_4_tuples[key].get_label_of_connection() 32 | break_v = 1 33 | break 34 | if break_v == 1: 35 | break 36 | 37 | def create_balanced_dataset(self): 38 | import csv 39 | from collections import OrderedDict 40 | 41 | botnet_lines = list() 42 | normal_lines = list() 43 | 44 | for key, con4tuple in self.connection_4_tuples.iteritems(): 45 | dest_ip = key[1] 46 | if dest_ip not in self.dns_connections_index: 47 | print dest_ip + "NOT FOUND IN DNS RECORDS..." 48 | else: 49 | dns_conn = self.dns_connections_index[dest_ip] 50 | 51 | features = OrderedDict() 52 | features["key"] = " ".join(key) 53 | features["number_of_flows"] = con4tuple.get_number_of_flows() 54 | features["average_of_duration"] = con4tuple.get_average_of_duration() 55 | features["standard_deviation_duration"] = con4tuple.get_standard_deviation_duration() 56 | features["percent_of_standard_deviation_duration"] = con4tuple.get_percent_of_standard_deviation_duration() 57 | features["total_size_of_flows_orig"] = con4tuple.get_total_size_of_flows_orig() 58 | features["total_size_of_flows_resp"] = con4tuple.get_total_size_of_flows_resp() 59 | features["ratio_of_sizes"] = con4tuple.get_ratio_of_sizes() 60 | features["percent_of_established_states"] = con4tuple.get_percent_of_established_states() 61 | features["inbound_pckts"] = con4tuple.get_inbound_pckts() 62 | features["outbound_pckts"] = con4tuple.get_outbound_pckts() 63 | features["periodicity_average"] = con4tuple.get_periodicity_average() 64 | features["periodicity_standart_deviation"] = con4tuple.get_periodicity_standart_deviation() 65 | features["ssl_ratio"] = con4tuple.get_ssl_ratio() 66 | features["average_public_key"] = con4tuple.get_average_public_key() 67 | features["tls_version_ratio"] = con4tuple.get_tls_version_ratio() 68 | features["average_of_certificate_length"] = con4tuple.get_average_of_certificate_length() 69 | features["standart_deviation_cert_length"] = con4tuple.get_standart_deviation_cert_length() 70 | 
features["is_valid_certificate_during_capture"] = con4tuple.is_valid_certificate_during_capture() 71 | features["amount_diff_certificates"] = con4tuple.get_amount_diff_certificates() 72 | features["number_of_domains_in_certificate"] = con4tuple.get_number_of_domains_in_certificate() 73 | features["get_certificate_ratio"] = con4tuple.get_certificate_ratio() 74 | features["number_of_certificate_path"] = con4tuple.get_number_of_certificate_path() 75 | features["x509_ssl_ratio"] = con4tuple.x509_ssl_ratio() 76 | features["SNI_ssl_ratio"] = con4tuple.SNI_ssl_ratio() 77 | features["self_signed_ratio"] = con4tuple.self_signed_ratio() 78 | features["is_SNIs_in_SNA_dns"] = con4tuple.is_SNIs_in_SNA_dns() 79 | features["SNI_equal_DstIP"] = con4tuple.get_SNI_equal_DstIP() 80 | features["is_CNs_in_SNA_dns"] = con4tuple.is_CNs_in_SNA_dns() 81 | 82 | # New features 83 | features["ratio_of_differ_SNI_in_ssl_log"] = con4tuple.ratio_of_differ_SNI_in_ssl_log() 84 | features["ratio_of_differ_subject_in_ssl_log"] = con4tuple.ratio_of_differ_subject_in_ssl_log() 85 | features["ratio_of_differ_issuer_in_ssl_log"] = con4tuple.ratio_of_differ_issuer_in_ssl_log() 86 | features["ratio_of_differ_subject_in_cert"] = con4tuple.ratio_of_differ_subject_in_cert() 87 | features["ratio_of_differ_issuer_in_cert"] = con4tuple.ratio_of_differ_issuer_in_cert() 88 | features["ratio_of_differ_sandns_in_cert"] = con4tuple.ratio_of_differ_sandns_in_cert() 89 | features["ratio_of_same_subjects"] = con4tuple.ratio_of_same_subjects() 90 | features["ratio_of_same_issuer"] = con4tuple.ratio_of_same_issuer() 91 | features["ratio_is_same_CN_and_SNI"] = con4tuple.ratio_is_same_CN_and_SNI() 92 | features["average_certificate_exponent"] = con4tuple.average_certificate_exponent() 93 | features["is_SNI_in_top_level_domain"] = con4tuple.is_SNI_in_top_level_domain() 94 | features["ratio_certificate_path_error"] = con4tuple.ratio_certificate_path_error() 95 | features["ratio_missing_cert_in_cert_path"] = 
con4tuple.ratio_missing_cert_in_cert_path() 96 | 97 | # DNS Features 98 | features.update(benchmark(dns_conn.compute_alexa_features)) 99 | features["FQDN_length"] = benchmark(dns_conn.get_FQDN_length) 100 | features["domain_name_length"] = benchmark(dns_conn.get_domain_name_length) 101 | features["number_of_numerical_chars"] = benchmark(dns_conn.get_number_of_numerical_chars) 102 | features["number_of_non_alphanumeric_chars"] = benchmark( 103 | dns_conn.get_number_of_non_alphanumeric_chars) 104 | features["number_unique_IP_addresses_in_response"] = benchmark( 105 | dns_conn.get_number_unique_IP_addresses_in_response) 106 | features["number_of_subdomains"] = benchmark(dns_conn.get_number_of_subdomains) 107 | features["average_ttls"] = benchmark(dns_conn.get_average_ttls) 108 | features["std_ttls"] = benchmark(dns_conn.get_std_ttls) 109 | features["min_ttls"] = benchmark(dns_conn.get_min_ttls) 110 | features["max_ttls"] = benchmark(dns_conn.get_max_ttls) 111 | features["number_of_hyphens_in_fqdn"] = benchmark(dns_conn.get_number_of_hyphens_in_fqdn) 112 | features["length_of_longest_subdomain_name"] = benchmark( 113 | dns_conn.get_length_of_longest_subdomain_name) 114 | features["number_of_voyels_in_fqdn"] = benchmark(dns_conn.get_number_of_voyels_in_fqdn) 115 | features["number_of_different_chars_in_fqdn"] = benchmark( 116 | dns_conn.get_number_of_different_chars_in_fqdn) 117 | features["number_of_consonants_in_fqdn"] = benchmark(dns_conn.get_number_of_consonants_in_fqdn) 118 | features["shannon_entropy_2ld"] = benchmark(dns_conn.get_shannon_entropy_2ld) 119 | features["shannon_entropy_3ld"] = benchmark(dns_conn.get_shannon_entropy_3ld) 120 | 121 | features["label"] = con4tuple.get_label_of_connection() 122 | 123 | if con4tuple.is_malware(): 124 | botnet_lines.append(features) 125 | else: 126 | normal_lines.append(features) 127 | 128 | # Shuffle & balance the whole dataset (50-50 botnet/normal traffic)\n 129 | from sklearn.utils import shuffle 130 | 131 | max_sample 
= min(len(botnet_lines), len(normal_lines)) 132 | 133 | logger.info("Number of Conn3tuples (botnet, normal) : {}".format((len(botnet_lines),len(normal_lines)))) 134 | logger.info("Down-sampling to {} conn4tuples/class".format(max_sample)) 135 | 136 | lines = shuffle(botnet_lines, n_samples=max_sample) + shuffle(normal_lines, n_samples=max_sample) 137 | logger.info("Total dataset lines: {}".format(len(lines))) 138 | 139 | with open(c.model_folder + 'features.csv', 'wb') as csvfile: 140 | writer = csv.DictWriter(csvfile, fieldnames=features.keys(), lineterminator='\n', delimiter=',', 141 | quoting=csv.QUOTE_NONNUMERIC) 142 | writer.writeheader() 143 | writer.writerows(lines) 144 | 145 | def create_dataset_dns(self): 146 | import csv 147 | from collections import OrderedDict 148 | 149 | with open(c.model_folder + 'dns_features.csv', 'wb') as csvfile: 150 | line = 0 151 | 152 | for key, dns_conn in self.dns_connections.iteritems(): 153 | features = OrderedDict() 154 | features["key"] = key 155 | features.update(benchmark(dns_conn.compute_alexa_features)) 156 | features["FQDN_length"] = benchmark(dns_conn.get_FQDN_length) 157 | features["domain_name_length"] = benchmark(dns_conn.get_domain_name_length) 158 | features["number_of_numerical_chars"] = benchmark(dns_conn.get_number_of_numerical_chars) 159 | features["number_of_non_alphanumeric_chars"] = benchmark(dns_conn.get_number_of_non_alphanumeric_chars) 160 | features["number_unique_IP_addresses_in_response"] = benchmark( 161 | dns_conn.get_number_unique_IP_addresses_in_response) 162 | features["number_of_subdomains"] = benchmark(dns_conn.get_number_of_subdomains) 163 | features["average_ttls"] = benchmark(dns_conn.get_average_ttls) 164 | features["std_ttls"] = benchmark(dns_conn.get_std_ttls) 165 | features["min_ttls"] = benchmark(dns_conn.get_min_ttls) 166 | features["max_ttls"] = benchmark(dns_conn.get_max_ttls) 167 | features["number_of_hyphens_in_fqdn"] = benchmark(dns_conn.get_number_of_hyphens_in_fqdn) 168 | 
features["length_of_longest_subdomain_name"] = benchmark(dns_conn.get_length_of_longest_subdomain_name) 169 | features["number_of_voyels_in_fqdn"] = benchmark(dns_conn.get_number_of_voyels_in_fqdn) 170 | features["number_of_different_chars_in_fqdn"] = benchmark( 171 | dns_conn.get_number_of_different_chars_in_fqdn) 172 | features["number_of_consonants_in_fqdn"] = benchmark(dns_conn.get_number_of_consonants_in_fqdn) 173 | features["shannon_entropy_2ld"] = benchmark(dns_conn.get_shannon_entropy_2ld) 174 | features["shannon_entropy_3ld"] = benchmark(dns_conn.get_shannon_entropy_3ld) 175 | 176 | if line == 0: 177 | writer = csv.DictWriter(csvfile, fieldnames=features.keys(), lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 178 | writer.writeheader() 179 | 180 | writer.writerow(features) 181 | line += 1 182 | 183 | def save_dataset_information(self): 184 | space = ' ' 185 | # with open("ExtractedData\\" + "conn_result.txt", 'w') as f: 186 | with open(c.model_folder + "/dataset_info.txt", 'w') as f: 187 | for key in self.dataset_information_dict.keys(): 188 | f.write(str(key) + space + 189 | str(self.dataset_information_dict[key].ssl_lines) + space + 190 | str(self.dataset_information_dict[key].not_founded_x509_lines) + space + 191 | str(self.dataset_information_dict[key].founded_x509_lines) + space + 192 | str(self.dataset_information_dict[key].err_not_added_x509) + 193 | "\n") 194 | f.close() 195 | 196 | 197 | """ 198 | Statistic methods. 
199 | """ 200 | def print_statistic(self): 201 | logger.info("-------------------------------------------") 202 | logger.info("----------- Statistic ---------------------") 203 | logger.info("-------------------------------------------") 204 | malware_certificates_array = [] 205 | 206 | normal_tuples = 0 207 | malware_tuples = 0 208 | flows_together = 0 209 | flows_normal = 0 210 | flows_malware = 0 211 | cert_together = 0 212 | cert_normal = 0 213 | cert_malware = 0 214 | for tuple_key in self.connection_4_tuples.keys(): 215 | conn_tuple = self.connection_4_tuples[tuple_key] 216 | flows_together += conn_tuple.get_number_of_ssl_flows() 217 | cert_together += len(conn_tuple.get_certificate_serial_dict().keys()) 218 | # More normal labels and malware labels in one 4-tuple ? 219 | if conn_tuple.get_malware_label() != 0 and conn_tuple.get_normal_label() != 0: 220 | logger.error("Error: More labels in one 4-tuples") 221 | # Same amout of labels in one 4-tuple? 222 | if conn_tuple.get_malware_label() == conn_tuple.get_normal_label(): 223 | logger.warning("Watch out: same amount of labels") 224 | logger.warning("Normal: {}".format(conn_tuple.get_normal_label())) 225 | logger.warning("Malware: {}".format(conn_tuple.get_malware_label())) 226 | 227 | if conn_tuple.is_malware(): 228 | malware_tuples += 1 229 | flows_malware += conn_tuple.get_number_of_ssl_flows() 230 | cert_malware += len(conn_tuple.get_certificate_serial_dict().keys()) 231 | 232 | malware_certificates_array += conn_tuple.get_x509_list() 233 | else: 234 | normal_tuples += 1 235 | flows_normal += conn_tuple.get_number_of_ssl_flows() 236 | cert_normal += len(conn_tuple.get_certificate_serial_dict().keys()) 237 | 238 | logger.info("Connection 4-tuples:") 239 | logger.info("All 4_tuples: {}".format(len(self.connection_4_tuples.keys()))) 240 | logger.info("Normal 4-tuples: {}".format(normal_tuples)) 241 | logger.info("Malware 4-tuples: {}".format(malware_tuples)) 242 | 243 | logger.info("Flows") 244 | 
logger.info("All gathered flows: {}".format(flows_together)) 245 | logger.info("Normal flows: {}".format(flows_normal)) 246 | logger.info("Malware flows: {}".format(flows_malware)) 247 | 248 | logger.info("Certificates") 249 | logger.info("All gathered certificates: {}".format(cert_together)) 250 | logger.info("Normal certificates: {}".format(cert_normal)) 251 | logger.info("Malware certificates: {}".format(cert_malware)) 252 | 253 | # Save malware certificates. 254 | self.save_malware_certificates(malware_certificates_array) 255 | 256 | def save_malware_certificates(self, x509_lines): 257 | with open(c.model_folder + '/malware_certificates', 'w') as f: 258 | for line in x509_lines: 259 | f.write(line + "\n") 260 | f.close() 261 | 262 | def benchmark(func, *params): 263 | #import datetime 264 | #import time 265 | #start_time = time.time() 266 | return_value = func(*params) if params else func() 267 | #total_time = datetime.timedelta(seconds=time.time() - start_time) 268 | #print("Function " + func.__name__ + " - execution time : " + str(total_time))#.strftime('%H:%M:%S')) 269 | return return_value -------------------------------------------------------------------------------- /features_extraction/ConnectionFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Connection4tuple import Connection4tuple 3 | 4 | 5 | class ConnectionFeatures(Connection4tuple): 6 | 7 | def __init__(self, tuple_index): 8 | super(ConnectionFeatures, self).__init__(tuple_index) 9 | 10 | """ 11 | ---------- Get Feature ------------------- 12 | """ 13 | # --------------------------------------------------- 14 | # 01. ---------- Number of flows -------------------- 15 | def get_number_of_flows(self): 16 | return self.get_number_of_ssl_flows() + self.get_number_of_not_ssl_flows() 17 | 18 | # --------------------------------------------------- 19 | # ---------- Duration of flows ---------------------- 20 | # 02. 
Average 21 | def get_average_of_duration(self): 22 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 23 | if self.flow_which_has_duration_number != 0: 24 | return self.average_duration / float(self.flow_which_has_duration_number) 25 | return -1 26 | 27 | # 03. Standard deviation 28 | def get_standard_deviation_duration(self): 29 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 30 | # EX = self.average_duration / float(self.flow_which_has_duration_number) 31 | # EX2 = self.average_duration_power / float(self.flow_which_has_duration_number) # E(X^2) 32 | # DX = EX2 - EX*EX 33 | # return pow(DX, 0.5) 34 | if len(self.duration_list) != 0 and len(self.duration_list) > 2: 35 | return np.std(self.duration_list) 36 | return -1 37 | 38 | # 04. Percent of flows which are bigger or less than standard deviation with average 39 | def get_percent_of_standard_deviation_duration(self): 40 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 41 | if len(self.duration_list) != 0: 42 | out_of_bounds = 0 43 | lower_level = self.get_average_of_duration() - self.get_standard_deviation_duration() 44 | upper_level = self.get_average_of_duration() + self.get_standard_deviation_duration() 45 | for i in range(len(self.duration_list)): 46 | if self.duration_list[i] < lower_level: 47 | out_of_bounds += 1 48 | elif self.duration_list[i] > upper_level: 49 | out_of_bounds += 1 50 | 51 | return out_of_bounds / float(self.flow_which_has_duration_number) 52 | return -1 53 | 54 | # ------------------------------------------------------------------- 55 | # 05 -------- Total payload size of flows the originator sent -------- 56 | def get_total_size_of_flows_orig(self): 57 | return self.total_size_of_flows_orig 58 | 59 | # ------------------------------------------------------------------ 60 | # 06 -------- Total payload size of 
flows the responder sent -------- 61 | def get_total_size_of_flows_resp(self): 62 | return self.total_size_of_flows_resp 63 | 64 | # --------------------------------------------------------------------------- 65 | # 07 ------ Ratio of responder payload sizes and originator payload sizes ---- 66 | def get_ratio_of_sizes(self): 67 | # self.check_zero_dividing(self.total_size_of_flows_orig, "Original size is 0 !!!") 68 | if self.total_size_of_flows_orig != 0: 69 | return self.total_size_of_flows_resp / float(self.total_size_of_flows_orig) 70 | return -1 71 | 72 | # -------------------------------------------------------------------- 73 | # ------ State of connection ----------------------------------------- 74 | # 08 Percent of established connection 75 | def get_percent_of_established_states(self): 76 | establihed_states = 0 77 | total_value_states = 0 78 | for key in self.state_of_connection_dict.keys(): 79 | total_value_states += self.state_of_connection_dict[key] 80 | if total_value_states != 0: 81 | establihed_states += self.state_of_connection_dict.get('SF', 0) 82 | establihed_states += self.state_of_connection_dict.get('S1', 0) 83 | establihed_states += self.state_of_connection_dict.get('S2', 0) 84 | establihed_states += self.state_of_connection_dict.get('S3', 0) 85 | establihed_states += self.state_of_connection_dict.get('RSTO', 0) # delete this 86 | establihed_states += self.state_of_connection_dict.get('RSTR', 0) # delete this 87 | return (establihed_states / float(total_value_states)) 88 | return -1 89 | 90 | """ 91 | These functions are not used. 
92 | """ 93 | # 09 - return 4 items 94 | # def get_based_states_ratio(self): 95 | # SF_S1 = self.state_of_connection_dict['SF'] + self.state_of_connection_dict['S1'] 96 | # S0 = self.state_of_connection_dict['S0'] 97 | # OTH = self.state_of_connection_dict['OTH'] 98 | # REJ = self.state_of_connection_dict['REJ'] 99 | # biggest = max(SF_S1, S0, OTH, REJ) / 100.0 100 | # return SF_S1 / float(biggest), S0 / float(biggest), OTH / float(biggest), REJ / float(biggest) 101 | # 102 | # # 10 - return 6 items 103 | # def get_extended_states_ratio(self): 104 | # SF_S1 = self.state_of_connection_dict['SF'] + self.state_of_connection_dict['S1'] 105 | # S0 = self.state_of_connection_dict['S0'] 106 | # OTH = self.state_of_connection_dict['OTH'] 107 | # REJ = self.state_of_connection_dict['REJ'] 108 | # RSTO_1 = self.state_of_connection_dict['RSTO'] + self.state_of_connection_dict['RSTR'] + self.state_of_connection_dict['S2'] + self.state_of_connection_dict['S3'] 109 | # RSTO_2 = self.state_of_connection_dict['RSTOS0'] + self.state_of_connection_dict['RSTRH'] + self.state_of_connection_dict['SH'] + self.state_of_connection_dict['SHR'] 110 | # biggest = max(SF_S1, S0, OTH, REJ, RSTO_1, RSTO_2) / 100.0 111 | # return SF_S1 / float(biggest), S0 / float(biggest), OTH / float(biggest), REJ / float(biggest), RSTO_1 / float(biggest), RSTO_2 / float(biggest) 112 | 113 | # 11 inbound packets == resp_pkts (18) 114 | # Number of packets that the responder sent. 115 | def get_inbound_pckts(self): 116 | return self.inbound_packtes 117 | 118 | # 12 outbound packets == orig_pkts (16) 119 | def get_outbound_pckts(self): 120 | return self.outbound_packtes 121 | 122 | # Periodicity 123 | # 13 Average of periodicity 124 | def get_periodicity_average(self): 125 | per_list = self.get_periodicity_list() 126 | sum = 0 127 | for i in range(len(per_list)): 128 | sum += per_list[i] 129 | if len(per_list) != 0: 130 | return sum / float(len(per_list)) 131 | # print "periodicity list is zero. 
Number of flows:", self.get_number_of_flows() 132 | return -1 133 | 134 | # 14 135 | def get_periodicity_standart_deviation(self): 136 | per_list = self.get_periodicity_list() 137 | if len(per_list) != 0 and len(per_list) > 2: 138 | # sum = 0 139 | # for i in range(len(per_list)): 140 | # sum += pow(per_list[i], 2) 141 | # EX2 = sum / float(len(per_list)) 142 | # DX = EX2 - EX * EX 143 | # return pow(DX, 0.5) 144 | return np.std(self.get_periodicity_list()) 145 | return -1 146 | 147 | # ----------------------------------------------------- 148 | # 15 ------ Ratio of not ssl flows and ssl flows ------- 149 | def get_ssl_ratio(self): 150 | self.check_zero_dividing(len(self.ssl_flow_list), "Original size is 0 !!!") 151 | return len(self.not_ssl_flow_list) / float(len(self.ssl_flow_list)) 152 | 153 | # 16 Average Public key lenghts 154 | # certificate feature 155 | def get_average_public_key(self): 156 | total = 0 157 | index = 0 158 | for key in self.certificate_key_length_dict.keys(): 159 | total += self.certificate_key_length_dict[key] * int(key) 160 | index += 1 161 | if index != 0: 162 | return total / float(index) 163 | return -1 164 | 165 | # ------------------------------------------------------ 166 | # 17 Version of ssl ratio 167 | def get_tls_version_ratio(self): 168 | tls = 0 169 | ssl = 0 170 | total = 0 171 | for key in self.version_of_ssl_dict.keys(): 172 | if 'tls' in key.lower(): 173 | tls += self.version_of_ssl_dict[key] 174 | elif 'ssl' in key.lower(): 175 | ssl += self.version_of_ssl_dict[key] 176 | total += self.version_of_ssl_dict[key] 177 | if total != 0: 178 | return tls / float(total) 179 | return -1 180 | 181 | # ---------------------------------------------- 182 | # Certificate validation length 183 | # 18 Average of certificate length 184 | # certificate_valid_length = sum of certificate valid length in days 185 | # certificate_valid_number = number of certificate* 186 | def get_average_of_certificate_length(self): 187 | # 
self.check_zero_dividing(self.certificate_valid_number, "certificate_valid_number is 0 !!!") 188 | if self.certificate_valid_number != 0: 189 | if np.mean(self.temp_list) != self.certificate_valid_length / float(self.certificate_valid_number): 190 | print "Error: numpy mean and mean by hand are not same." 191 | return self.certificate_valid_length / float(self.certificate_valid_number) 192 | return -1 193 | 194 | # 19 195 | def get_standart_deviation_cert_length(self): 196 | # self.check_zero_dividing(self.certificate_valid_number, "certificate_valid_number is 0 !!!") 197 | if self.certificate_valid_number != 0: 198 | EX = self.certificate_valid_length / self.certificate_valid_number 199 | EX2 = self.certificate_valid_length_pow / self.certificate_valid_number 200 | DX = EX2 - (EX * EX) 201 | # if DX < 0: 202 | # print "EX:", (EX*EX) 203 | # print "EX2:", EX2 204 | # print "DX:", DX 205 | # print self.temp_list 206 | # print "std:", numpy.std(self.temp_list) 207 | # print len(self.x509_list) 208 | return pow(DX, 0.5) 209 | return -1 210 | 211 | # --------------------------------------------- 212 | # 20 Validity of the certificate during the capture 213 | # certificate feature 214 | # 0 == no certficate was out of validity range 215 | def is_valid_certificate_during_capture(self): 216 | if len(self.cert_percent_validity) != 0: 217 | return self.not_valid_certificate_number 218 | return -1 219 | 220 | # 21 Amount of different certificates 221 | # certificate feature 222 | def get_amount_diff_certificates(self): 223 | return len(self.certificate_serial_dict.keys()) 224 | 225 | # ------------------------------------------------------- 226 | # 22 Number of domains in certificate 227 | # certificate feature 228 | def get_number_of_domains_in_certificate(self): 229 | if self.number_san_domains_index != 0: 230 | return self.number_san_domains / float(self.number_san_domains_index) 231 | return -1 232 | 233 | # 23 Certificate ratio 234 | # certificate feature 235 | # List 
of length of certificate validity length. 236 | def get_certificate_ratio(self): 237 | if len(self.cert_percent_validity) != 0: 238 | temp = 0 239 | for value in self.cert_percent_validity: 240 | temp += value 241 | return temp / float(len(self.cert_percent_validity)) 242 | else: 243 | return -1 244 | 245 | # 24 Certificate path 246 | # number of signed certificate in our first certificate 247 | # It is EX (vazeny prumer) 248 | def get_number_of_certificate_path(self): 249 | up = 0 250 | down = 0 251 | for key in self.certificate_path.keys(): 252 | up += int(key) * self.certificate_path[key] 253 | down += self.certificate_path[key] 254 | if down != 0: 255 | return up/float(down) 256 | return -1 257 | 258 | # 25 x509/ssl ratio 259 | # ratio about how many ssl log has x509 information in this connection 260 | def x509_ssl_ratio(self): 261 | if len(self.ssl_logs_list) == 0: 262 | return -1 263 | return len(self.x509_list) / float(len(self.ssl_logs_list)) 264 | 265 | # 26 SNI and SSL ratio 266 | # ratio, how many ssl flows have SNI (server name) 267 | def SNI_ssl_ratio(self): 268 | return self.ssl_with_SNI / float(len(self.ssl_logs_list)) 269 | 270 | # 27 Self_signed cert and all cert ratio 271 | def self_signed_ratio(self): 272 | # number_of_certificate = len(self.certificate_serial_dict.keys()) 273 | if len(self.ssl_logs_list) != 0: 274 | return self.self_signed_cert / float(len(self.ssl_logs_list)) 275 | return -1 276 | 277 | # 28 Is there any SNI, which not in san.dns ? 278 | def is_SNIs_in_SNA_dns(self): 279 | if len(self.is_SNI_in_san_dns) != 0: 280 | for a in self.is_SNI_in_san_dns: 281 | if a == 0: 282 | return 0 283 | return 1 284 | return -1 285 | 286 | # 29 if SNI is IP, so dst is same ip? 287 | def get_SNI_equal_DstIP(self): 288 | return self.SNI_equal_DstIP 289 | 290 | # 30 Is there any CN, which not in san.dns ? 
291 | def is_CNs_in_SNA_dns(self): 292 | if len(self.is_CN_in_SAN_list) != 0: 293 | for a in self.is_CN_in_SAN_list: 294 | if a == 0: 295 | return 0 296 | return 1 297 | return -1 298 | 299 | 300 | """ 301 | ----------------- New Features ------------------ 302 | """ 303 | # 31 How many ssl lines has different SNI ? 304 | def ratio_of_differ_SNI_in_ssl_log(self): 305 | # Delete stars. 306 | for i in range(0, len(self.SNI_list)): 307 | if '*' in self.SNI_list[i]: 308 | self.SNI_list[i] = self.SNI_list[i].replace('*', '') 309 | 310 | return compute_differents_in_lines(self.SNI_list) 311 | 312 | # 32 How many ssl lines has different subject 313 | def ratio_of_differ_subject_in_ssl_log(self): 314 | return compute_differents_in_lines(self.subject_ssl_list) 315 | 316 | # 33 How many ssl lines has differ issuer 317 | def ratio_of_differ_issuer_in_ssl_log(self): 318 | return compute_differents_in_lines(self.issuer_ssl_list) 319 | 320 | # 34 How many cert has differ subject 321 | def ratio_of_differ_subject_in_cert(self): 322 | return compute_differents_in_lines(self.subject_x509_list) 323 | 324 | # 35 How many cert has differ issuer 325 | def ratio_of_differ_issuer_in_cert(self): 326 | return compute_differents_in_lines(self.issuer_x509_list) 327 | 328 | # 36 How many cert has differ san dns 329 | def ratio_of_differ_sandns_in_cert(self): 330 | return compute_differents_in_lines(self.san_x509_list) 331 | 332 | # 37 Do ssl and x509 lines have same subjects? 333 | def ratio_of_same_subjects(self): 334 | if len(self.x509_list) == 0: 335 | return -1 336 | return self.subject_diff / float(len(self.x509_list)) 337 | 338 | # 38 Do ssl and x509 lines have same issuer? 339 | def ratio_of_same_issuer(self): 340 | if len(self.x509_list) == 0: 341 | return -1 342 | return self.issuer_diff / float(len(self.x509_list)) 343 | 344 | # 39 Is SNI and CN same? 
# 39 (cont.) Ratio of ssl lines whose SNI appears in the certificate CN.
def ratio_is_same_CN_and_SNI(self):
    if len(self.x509_list) == 0:
        return -1
    return self.SNI_is_in_CN / float(len(self.x509_list))

# 40 Certificate exponent average
def average_certificate_exponent(self):
    number_of_certs = len(self.certificate_serial_dict.keys())
    if number_of_certs == 0:
        return -1
    return self.certificate_exponent / float(number_of_certs)

# 41 Is server name in top-level-domain?
def is_SNI_in_top_level_domain(self):
    if self.ssl_with_SNI == 0:
        return -1
    return self.top_level_domain_error / float(self.ssl_with_SNI)

# 42 Is certificate path right? (issuer of first certificate is subject in second cert...)
# BUGFIX: the original condition `if len(self.ssl_logs_list):` was inverted --
# it returned -1 whenever ssl logs WERE present and raised ZeroDivisionError
# when the list was empty. Return -1 only for the empty case.
def ratio_certificate_path_error(self):
    if len(self.ssl_logs_list) == 0:
        return -1
    return self.certificate_path_error / float(len(self.ssl_logs_list))

# 43 Missing certificate in certificate path.
# BUGFIX: same inverted emptiness check as feature 42.
def ratio_missing_cert_in_cert_path(self):
    if len(self.ssl_logs_list) == 0:
        return -1
    return self.missing_cert_in_cert_path / float(len(self.ssl_logs_list))


"""
------- Computation method ---------
"""
def compute_differents_in_lines(array):
    # Ratio of distinct values in `array`: -1.0 for no data, 0.0 when all
    # values are equal, otherwise distinct/total.
    # (Bare `except:` counting replaced with dict.get.)
    if len(array) == 0:
        return -1.0
    counts = dict()
    for item in array:
        counts[item] = counts.get(item, 0) + 1
    if len(counts) == 1:
        return 0.0
    return len(counts) / float(len(array))


# --------------------------------------------------------------------------
# /features_extraction/DNSConnection.py
# --------------------------------------------------------------------------

"""
This class stores all information for DNS records that have the same Domain
name => called one DNSConnection.
"""


class DNSConnection(object):

    def __init__(self, FQDN):
        self.FQDN = FQDN
        self.subdomains = self.FQDN.split('.')
        # Remaining attributes (domain_name, dns_records, ttls, answers)
        # are initialized below in the original file.
# (continuation of DNSConnection.__init__)
self.domain_name = '.'.join(self.subdomains[-2:])  # registered (2nd-level) domain
self.dns_records = list()   # raw bro dns records for this FQDN
self.ttls = list()          # every TTL value observed
self.answers = set()        # distinct IPv4 answers

# --------------------------------------------------------------------------
# /features_extraction/DNSFeatures.py
# --------------------------------------------------------------------------

from DNSConnection import DNSConnection
import string
import csv
from collections import OrderedDict
import config as c
import numpy as np


class DNSFeatures(DNSConnection):
    # Class-level Alexa rankings shared by every instance; populated once
    # via load_all_top_alexa().
    alexa_top100 = list()
    alexa_top1k = list()
    alexa_top10k = list()
    alexa_top100k = list()
    alexa_top1m = list()

    def __init__(self, index):
        super(DNSFeatures, self).__init__(index)

    @staticmethod
    def get_alexa(filename):
        # Each alexa_*.csv stores the whole ranking on a single
        # space-delimited row; return that row as a list of domains.
        with open(filename, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
            # builtin next() instead of the Py2-only .next() method
            return next(csvreader)

    @staticmethod
    def load_all_top_alexa():
        DNSFeatures.alexa_top100 = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top100.csv")
        DNSFeatures.alexa_top1k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top1k.csv")
        DNSFeatures.alexa_top10k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top10k.csv")
        DNSFeatures.alexa_top100k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top100k.csv")
        DNSFeatures.alexa_top1m = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top1m.csv")

    def add_dns_record(self, dns_record):
        # Store the raw record and fold it into the aggregated features.
        self.dns_records.append(dns_record)
        self.compute_classic_features(dns_record)

    def compute_classic_features(self, dns_record):
        # '-' is Bro/Zeek's placeholder for an unset field.
        if dns_record["answers"] != '-':
            self.answers.update(filter(is_ipv4, dns_record["answers"].split(',')))
        if dns_record["TTLs"] != '-':
            self.ttls += map(float, dns_record["TTLs"].split(','))

    ############
    # Features #
    ############

    ############ Anderson
# -----------------------------------------------
# 00. ---------- FQDN Length --------------------
def get_FQDN_length(self):
    """Length of the full domain name, dots included."""
    return len(self.FQDN)

# -----------------------------------------------
# 00. ---------- Domain name Length -------------
def get_domain_name_length(self):
    """Length of the registered (second-level) domain name."""
    return len(self.domain_name)

# ------------------------------------------------------------------
# 00. ---------- number of numerical characters --------------------
def get_number_of_numerical_chars(self):
    # sum(...) is equivalent to the original len(filter(...)) and also
    # works on Python 3, where len(filter(...)) raises TypeError.
    return sum(1 for ch in self.FQDN if ch in string.digits)

# ------------------------------------------------------------------
# 00. ---------- number of non-alphanumeric characters -------------
def get_number_of_non_alphanumeric_chars(self):
    """Count of characters that are neither letters, digits nor dots."""
    alphanumeric = string.ascii_letters + string.digits
    return sum(1 for ch in self.FQDN if ch not in alphanumeric and ch != '.')

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- alexa features --------------------
def compute_alexa_features(self):
    """One-hot membership of the registered domain in the Alexa tiers.

    Exactly one returned key is 1: the first (smallest) Alexa list that
    contains the domain, or 'not_in_alexa' when none does.
    """
    tier_lists = [
        ("in_alexa_top100", DNSFeatures.alexa_top100),
        ("in_alexa_top1k", DNSFeatures.alexa_top1k),
        ("in_alexa_top10k", DNSFeatures.alexa_top10k),
        ("in_alexa_top100k", DNSFeatures.alexa_top100k),
        ("in_alexa_top1m", DNSFeatures.alexa_top1m),
    ]
    alexa_features = OrderedDict()
    for name, _ in tier_lists:
        alexa_features[name] = 0
    alexa_features["not_in_alexa"] = 0

    for name, ranking in tier_lists:
        if binarySearch(ranking, self.domain_name):
            alexa_features[name] = 1
            break
    else:
        alexa_features["not_in_alexa"] = 1
    return alexa_features

######### Mine

# ------------------------------------------------------------------
# 00. ---------- number of unique IP addresses in response ---------
def get_number_unique_IP_addresses_in_response(self):
    """How many distinct IPv4 answers were collected for this domain."""
    return len(self.answers)

# ------------------------------------------------------------------
# 00. ---------- number of subdomains ------------------------------
def get_number_of_subdomains(self):
    """Number of dot-separated labels in the FQDN (TLD included)."""
    return len(self.FQDN.split('.'))

# ------------------------------------------------------------------
# 00. ---------- average TTLs --------------------------------------
def get_average_ttls(self):
    """Mean of all observed TTL values, or -1 when none were seen."""
    if not self.ttls:
        return -1
    return sum(self.ttls) / len(self.ttls)

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- std TTLs --------------------
def get_std_ttls(self):
    """Standard deviation of observed TTLs; -1 with fewer than 3 samples.

    NOTE(review): the `> 2` threshold is kept from the original -- std of
    exactly two samples is also well defined; confirm before changing.
    """
    if len(self.ttls) > 2:
        return np.std(self.ttls)
    return -1

# ------------------------------------------------------------------
# 00. ---------- min TTLs ------------------------------------------
def get_min_ttls(self):
    """Smallest observed TTL, or -1 when none were seen."""
    return min(self.ttls) if len(self.ttls) > 0 else -1

# ------------------------------------------------------------------
# 00. ---------- max TTLs ------------------------------------------
def get_max_ttls(self):
    """Largest observed TTL, or -1 when none were seen."""
    return max(self.ttls) if len(self.ttls) > 0 else -1

# ------------------------------------------------------------------
# 00. ---------- number of hyphens in fqdn -------------------------
def get_number_of_hyphens_in_fqdn(self):
    # str.count replaces the Py2-only len(filter(...)) construct.
    return self.FQDN.count("-")

# ------------------------------------------------------------------
# 00. ---------- length of the longest subdomain name --------------
def get_length_of_longest_subdomain_name(self):
    return max(len(label) for label in self.FQDN.split('.'))

# ------------------------------------------------------------------
# 00. ---------- number of voyels ----------------------------------
def get_number_of_voyels_in_fqdn(self):
    voyels = "aeioue"
    # sum(...) replaces the Py2-only len(filter(...)) construct.
    return sum(1 for ch in self.FQDN if ch in voyels)

# ------------------------------------------------------------------
# 00. ---------- number of different chars in fqdn -----------------
def get_number_of_different_chars_in_fqdn(self):
    # Dots are separators, not part of the name.
    return len(set(self.FQDN) - set('.'))

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- number of consonants --------------------
def get_number_of_consonants_in_fqdn(self):
    """Count of ASCII consonants anywhere in the FQDN."""
    consonants = "zrtypqsdfghjklmwxcvbn"
    # sum(...) replaces the Py2-only len(filter(...)) construct.
    return sum(1 for ch in self.FQDN if ch in consonants)

# ------------------------------------------------------------------
# 00. ---------- shannon entropy on 2ld ----------------------------
def get_shannon_entropy_2ld(self):
    """Shannon entropy of the second-level domain label."""
    try:
        return entropy(self.subdomains[-2])
    except IndexError:
        # Debug output before re-raising; parenthesized so it also runs on
        # Python 3 (the original used Py2-only `print x` statements).
        print(self.FQDN)
        print(self.subdomains)
        raise

# ------------------------------------------------------------------
# 00. ---------- shannon entropy on 3ld ----------------------------
def get_shannon_entropy_3ld(self):
    """Shannon entropy of the third-level label, or -1 if there is none."""
    if len(self.subdomains) > 2:
        return entropy(self.subdomains[-3])
    return -1


# UTILITIES

def binarySearch(alist, item):
    """Standard binary search; `alist` must be sorted ascending.

    Returns True when `item` is found. (The original kept dead `pos` and
    `found` bookkeeping variables, removed here.)
    """
    first = 0
    last = len(alist) - 1
    while first <= last:
        midpoint = (first + last) // 2
        if alist[midpoint] == item:
            return True
        if item < alist[midpoint]:
            last = midpoint - 1
        else:
            first = midpoint + 1
    return False


def entropy(s):
    """Calculate the Shannon entropy (in bits) of a string.

    (Parameter renamed from `str`, which shadowed the builtin.)
    """
    import math

    # Probability of each distinct character in the string.
    prob = [float(s.count(ch)) / len(s) for ch in dict.fromkeys(list(s))]
    return -sum(p * math.log(p) / math.log(2.0) for p in prob)


def is_ipv4(s):
    """True when `s` is a dotted-quad IPv4 address with octets in 0..255."""
    parts = s.split('.')
    if len(parts) != 4:
        return False
    try:
        octets = [int(x) for x in parts]
    except ValueError:
        return False
    # all() replaces the Py2-only len(filter(...)) == 4 test.
    return all(0 <= x <= 255 for x in octets)


# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# /features_extraction/DatasetInformation.py
# --------------------------------------------------------------------------


class DatasetInformation:
    """Bookkeeping counters for one dataset: how many ssl lines were seen
    and how many of them could (not) be paired with an x509 line."""

    def __init__(self, ssl_lines, not_founded_x509_lines, err_not_added_x509, founded_x509_lines):
        self.ssl_lines = ssl_lines
        self.not_founded_x509_lines = not_founded_x509_lines
        self.founded_x509_lines = founded_x509_lines
        self.err_not_added_x509 = err_not_added_x509


# --------------------------------------------------------------------------
# /features_extraction/ExtractFeatures.py
# --------------------------------------------------------------------------

import os
from ConnectionFeatures import ConnectionFeatures
from DatasetInformation import DatasetInformation
from CertificateFeatures import CertificateFeatures
from DNSFeatures import DNSFeatures

from logger import get_logger
logger = get_logger('debug')


class ExtractFeatures(object):
    """Walks one dataset's bro logs (conn/x509/ssl/dns) and aggregates them
    into per-4-tuple connections, certificates and DNS objects."""

    def __init__(self):
        # (src_ip, dst_ip, dst_port, protocol) -> ConnectionFeatures
        self.connection_4_tuples = dict()

        self.x509_dict = dict()               # x509 uid -> x509 log line
        self.control_ssl_uids_dict = dict()   # ssl uid -> ssl line (duplicate guard)

        self.number_conn_lines = 0
        self.conn_dict = dict()               # conn uid -> labelled conn line

        # Error counters.
        self.err_conn_uids = 0
        self.err_more_same_X509 = 0
        self.err_not_added_x509 = 0

        # Per-dataset ssl/x509 pairing statistics (reset between datasets).
        self.ssl_lines = 0
        self.not_founded_x509_lines = 0
        self.founded_x509_lines = 0

        self.certificate_dict = dict()        # cert serial -> CertificateFeatures

        self.dataset_information_dict = dict()

        self.dns_lines = 0
        self.dns_connections = dict()
        # keys = IPs, values = dns_connections. Note: multiple keys can refer
        # to the same dns_connection.
        self.dns_connections_index = dict()

    def extraction_manager(self, dataset_path_to_logs):
        """Process every bro log of one dataset, in dependency order."""
        # Loads all conn logs in bro folder.
        self.conn_logs(dataset_path_to_logs)
        # Loads all x509 logs in bro folder.
        self.x509_logs(dataset_path_to_logs)
        # Load all ssl logs.
        self.ssl_logs(dataset_path_to_logs)
        # Find not-ssl lines in conn logs that belong to created conn 4-tuples.
        self.conn_logs_2(dataset_path_to_logs)
        # Load all dns logs.
        self.dns_logs(dataset_path_to_logs)

        logger.info("SSL Lines: {}".format(self.ssl_lines))
        logger.info("Not founded x509 lines: {}".format(self.not_founded_x509_lines))
        logger.info("Not '-' x509 lines: {}".format(self.err_not_added_x509))
        logger.info("Founded x509 lines: {}".format(self.founded_x509_lines))

        dataset_info = DatasetInformation(self.ssl_lines, self.not_founded_x509_lines,
                                          self.err_not_added_x509, self.founded_x509_lines)
        self.dataset_information_dict[dataset_path_to_logs] = dataset_info

        # Reset the per-dataset counters for the next dataset.
        self.ssl_lines = 0
        self.not_founded_x509_lines = 0
        self.founded_x509_lines = 0
        self.err_not_added_x509 = 0

    """
    ---------------------- Conn logs. -------------------------
    """
    def conn_logs(self, dataset_path_to_logs):
        logger.info("loading conn logs...")
        print(" << Read all conn logs:")
        print("Reading conn logs:")
        self.number_conn_lines = 0
        all_conn_logs = get_such_logs(dataset_path_to_logs, ['conn', '_label'])
        for conn_log in all_conn_logs:
            self.read_conn_log(dataset_path_to_logs + conn_log)
        print(" << Loaded conn logs: {}".format(len(all_conn_logs)))

    def read_conn_log(self, dataset_path_to_conn):
        """Cache every labelled (non-Background) conn line by its uid."""
        try:
            with open(dataset_path_to_conn) as f:
                for line in f:
                    if line[0] == '#':
                        continue
                    split_conn_line = line.split('\t')
                    conn_uid = split_conn_line[1]

                    # Lines without a label column are useless here.
                    if len(split_conn_line) < 22:
                        continue

                    label = split_conn_line[21]
                    if 'Background' in label or 'No_Label' in label:
                        continue

                    # Keep the first occurrence of each uid; warn about dups.
                    if conn_uid in self.conn_dict:
                        print("Error: more same conn line !")
                    else:
                        self.conn_dict[conn_uid] = line

                    # Bro appends a "#close" footer; nothing useful follows.
                    if "#close" in line:
                        break
        except IOError:
            logger.error("Error: The conn file: {} does not exist.".format(dataset_path_to_conn))

    """
    --------------------- X509 logs. ------------------------
    """
    def x509_logs(self, dataset_path_to_logs):
        logger.info("loading x509 logs...")
        print("<< Read all x509 logs:")
        # Clear leftovers from the previous dataset.
        self.x509_dict = dict()
        all_x509_logs = get_such_logs(dataset_path_to_logs, ['x509'])
        print("num x509 logs: {}".format(len(all_x509_logs)))
        for x509_log in all_x509_logs:
            self.read_x509_log(dataset_path_to_logs, x509_log)
        print(" << Loaded x509 logs: {}".format(len(all_x509_logs)))

    def read_x509_log(self, dataset_path_to_logs, x509_log):
        """
        Read start_date.txt where the capture start time of this dataset is
        stored. Some datasets start at 1.1.1970 00:00:00, so that offset must
        be added to every timestamp. When the file does not exist the dataset
        already has correct times.
        """
        # 'start_date.txt' lives in the dataset folder, not in the bro folder.
        sub_folder = os.path.dirname(dataset_path_to_logs)
        started_unix_time = 0.0
        try:
            with open(sub_folder + "/start_date.txt") as f:
                started_unix_time = float(f.readlines()[1])
                print(" << Started unix time file was read in: {}".format(sub_folder))
        except IOError:
            # It means that this dataset has the right time format.
            pass

        try:
            with open(dataset_path_to_logs + x509_log) as f:
                for line in f:
                    if '#' == line[0]:
                        continue
                    # NOTE(review): fields are split on ' ' here but on '\t'
                    # in read_conn_log -- confirm the real field separator.
                    x509_split = line.split(' ')

                    # Shift the timestamp for datasets captured "since 1970".
                    time_new = float(x509_split[0]) + started_unix_time
                    new_line = ' '.join([str(time_new)] + x509_split[1:])

                    x509_uid = x509_split[1]
                    if x509_uid in self.x509_dict:
                        self.err_more_same_X509 += 1
                    else:
                        self.x509_dict[x509_uid] = new_line
        except IOError:
            logger.error("Error: The x509 file: " + dataset_path_to_logs + x509_log + " does not exist.")

    """
    --------------------- SSL logs. ------------------------
    """
    def ssl_logs(self, dataset_path_to_logs):
        print("<< Read all ssl logs::")
        self.control_ssl_uids_dict = dict()
        all_ssl_logs = get_such_logs(dataset_path_to_logs, ['ssl'])
        for ssl_log in all_ssl_logs:
            self.create_4_tuples(dataset_path_to_logs + ssl_log)
        print(" << Loaded ssl logs: {}".format(len(all_ssl_logs)))

    def create_4_tuples(self, path_to_ssl_log):
        """Group every labelled ssl flow by its (src, dst, dport, proto)."""
        with open(path_to_ssl_log) as ssl_file:
            for ssl_line in ssl_file:
                if '#' == ssl_line[0]:
                    continue

                ssl_split = ssl_line.split(' ')
                ssl_uid = ssl_split[1]

                # Some ssl.log files contain duplicated lines (probably a bro
                # error) -- process each uid only once.
                if ssl_uid in self.control_ssl_uids_dict:
                    stored_line = self.control_ssl_uids_dict[ssl_uid]
                    if ssl_line != stored_line:
                        old_ssl_split = stored_line.split(' ')
                        new_ssl_split = ssl_line.split(' ')
                        # Only the first 21 columns are expected to match.
                        for i in range(0, len(old_ssl_split)):
                            if i <= 20 and old_ssl_split[i] != new_ssl_split[i]:
                                # BUGFIX: was logger.erro(...), an AttributeError.
                                logger.error("SSL Error - ssl lines with same uid are not same! Path: {} SSL uid: {}".format(path_to_ssl_log, ssl_uid))
                    continue
                self.control_ssl_uids_dict[ssl_uid] = ssl_line

                # Find the flow in conn.log by this ssl uid. conn_dict only
                # contains normal/malware lines, so ssl lines pointing at
                # background conn lines miss here.
                conn_log = self.conn_dict.get(ssl_uid)
                if conn_log is None:
                    continue

                conn_split = conn_log.split(' ')
                # 2-srcIpAddress, 4-dstIpAddress, 5-dstPort, 6-Protocol
                connection_index = conn_split[2], conn_split[4], conn_split[5], conn_split[6]

                try:
                    label = conn_split[21]
                except IndexError:
                    logger.error("Error: no label in conn line. conn index: {}".format(connection_index))
                    # BUGFIX: the original fell through with `label` unbound
                    # and crashed (NameError) on the next statement.
                    continue

                if 'Background' in label or 'No_Label' in label:
                    logger.error("Error: Backgroung label. conn index: {}".format(connection_index))
                    continue

                if 'Botnet' not in label and 'Normal' not in label:
                    logger.error("Error: Dear more, there are more states of labels !!!! conn index: {}".format(connection_index))

                if connection_index not in self.connection_4_tuples:
                    self.connection_4_tuples[connection_index] = ConnectionFeatures(connection_index)
                self.connection_4_tuples[connection_index].add_ssl_flow(conn_log, label)

                self.ssl_lines += 1
                # Pair the ssl line with its x509 line(s).
                valid_x509_list = self.split_ssl(ssl_line, connection_index, label)

                self.connection_4_tuples[connection_index].add_ssl_log(ssl_line, valid_x509_list,
                                                                       os.path.basename(path_to_ssl_log))

                # For checking the certificate path, collect every x509 log of
                # the chain (column 14 holds a comma-separated uid list).
                list_of_x509_uids = ssl_split[14].split(',')
                x509_lines_arr = []
                is_founded = True
                for x509_uid in list_of_x509_uids:
                    if x509_uid in self.x509_dict:
                        x509_lines_arr.append(self.x509_dict[x509_uid])
                    else:
                        # Keep scanning: the original notes `break` here
                        # caused an error.
                        is_founded = False
                self.connection_4_tuples[connection_index].check_certificate_path(x509_lines_arr, is_founded)

    '''
    Methods for adding not-ssl flows from conn.log to the connection 4-tuples.
    '''

    def conn_logs_2(self, dataset_path_to_logs):
        print(" << Read all conn logs again:")
        all_conn_logs = get_such_logs(dataset_path_to_logs, ['conn', '_label'])
        for conn_log in all_conn_logs:
            self.add_not_ssl_logs(dataset_path_to_logs + conn_log)
        print(" << Loaded conn logs 2: {}".format(len(all_conn_logs)))

    def add_not_ssl_logs(self, path_to_conn):
        print(" <<< adding not ssl flow:")
        with open(path_to_conn) as f:
            for line in f:
                if '#' == line[0]:
                    continue
                conn_split = line.split(' ')
                # 2-srcIpAddress, 4-dstIpAddress, 5-dstPort, 6-Protocol
                if len(conn_split) < 7:
                    continue

                connection_index = conn_split[2], conn_split[4], conn_split[5], conn_split[6]
                try:
                    label = conn_split[21]
                except IndexError:
                    # NOTE(review): "False" is a sentinel string, not a bool.
                    label = "False"
                conn_uid = conn_split[1]

                if 'Background' in label or 'No_Label' in label:
                    continue

                # Only 4-tuples that already exist (i.e. had ssl traffic) get
                # their non-ssl flows attached; anything else is skipped.
                connection = self.connection_4_tuples.get(connection_index)
                if connection is not None and conn_uid not in connection.get_uid_flow_dict():
                    connection.add_not_ssl_flow(line, label)

    """
    ---------------------- DNS logs. -------------------------
    """

    def dns_logs(self, dataset_path_to_logs):
        logger.info("loading dns logs...")
        print(" << Read all dns logs:")
        print("Reading dns logs:")
        self.dns_lines = 0
        all_dns_logs = get_such_logs(dataset_path_to_logs, ['dns'])
        for dns_log in all_dns_logs:
            self.read_dns_log(dataset_path_to_logs + dns_log)
        print(" << Loaded dns logs: {}".format(len(all_dns_logs)))

    def read_dns_log(self, dataset_path_to_dns):
        """Aggregate every A/AAAA query with a plausible domain name."""
        headers = None
        try:
            with open(dataset_path_to_dns) as f:
                for line in f:
                    split_dns_line = line.split('\t')
                    if split_dns_line[0] == "#fields":
                        headers = split_dns_line[1:]
                        continue
                    elif line[0] == '#':
                        continue
                    if headers is None:
                        # ROBUSTNESS: a record before the "#fields" header
                        # would have crashed on an unbound `headers`.
                        continue

                    dns_record = dict(zip(headers, split_dns_line))

                    unknown_domain_names = ["(empty)", "immutableset"]
                    if (dns_record['qtype_name'] == 'A' or dns_record['qtype_name'] == 'AAAA') and \
                            dns_record['query'] not in unknown_domain_names and '.' in dns_record['query']:
                        dns_index = dns_record['query']
                        # One DNSFeatures object per queried name; re-index it
                        # by every resolved IP after each new record.
                        if dns_index not in self.dns_connections:
                            self.dns_connections[dns_index] = DNSFeatures(dns_index)
                        self.dns_connections[dns_index].add_dns_record(dns_record)
                        for ip in self.dns_connections[dns_index].answers:
                            self.dns_connections_index[ip] = self.dns_connections[dns_index]

                        self.dns_lines += 1
        except IOError:
            logger.error("Error: The dns file: {} does not exist.".format(dataset_path_to_dns))

    """
    ------------------------------------------------
    --------------- Methods ------------------------
    ------------------------------------------------
    """

    '''
    Just a checking function: every x509 uid from the ssl log should be found
    in the x509 file.
    '''
    def split_ssl(self, ssl_line, tuple_index, label):
        split = ssl_line.split(' ')
        # '-' / '(object)' mean no certificate information in this ssl line.
        if '-' == split[14] or '(object)' == split[14]:
            self.err_not_added_x509 += 1
            return []
        self.put_server_name_to_dict(split[1], split[9], tuple_index, split[14], label)
        return self.get_x509_lines(split[14].split(','))

    '''
    Return the x509 line referenced by the FIRST uid of the ssl line's uid list.
    '''
    def get_x509_lines(self, x509_uids_list):
        uid_x509 = x509_uids_list[0]
        if uid_x509 not in self.x509_dict:
            self.not_founded_x509_lines += 1
            return []
        self.founded_x509_lines += 1
        return [self.x509_dict[uid_x509]]

    # certificate dict
    def put_server_name_to_dict(self, ssl_uid, server_name, tuple_index, x509_uids_list, label):
        """Register `server_name` under the certificate serial of the first
        x509 uid in `x509_uids_list` (a comma-separated uid string)."""
        uid_x509 = x509_uids_list.split(',')[0]
        if uid_x509 not in self.x509_dict:
            logger.error("Error: [put_server_name] In ProcessLogs.py x509 does not have this x509uid: {}".format(uid_x509))
            return
        x509_line = self.x509_dict[uid_x509]
        cert_serial = x509_line.split(' ')[3]
        if cert_serial not in self.certificate_dict:
            self.certificate_dict[cert_serial] = CertificateFeatures(cert_serial, x509_line)
        self.certificate_dict[cert_serial].add_server_name(server_name, label)
        self.certificate_dict[cert_serial].add_x509_line(x509_line)


def get_such_logs(path_to_logs, part_name_list):
    """Filenames in `path_to_logs` whose name contains every fragment of
    `part_name_list`."""
    return [name for name in os.listdir(path_to_logs)
            if all(part in name for part in part_name_list)]


# --------------------------------------------------------------------------
# /features_extraction/MainBro.py
# --------------------------------------------------------------------------

import os
import sys
sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/')

from time import time
# (continuation of MainBro.py imports; `from time import time` precedes)
from time import time
import datetime
from ComputeFeatures import ComputeFeatures
import config as c
import os
from DNSFeatures import DNSFeatures
from logger import get_logger

logger = get_logger('debug')


def main():
    """Run feature extraction over every dataset sub-folder."""
    # Start to count the time.
    start_time = time()

    # Create new instance.
    extract_features = ComputeFeatures()

    print(" << Loading top alexa: ")
    DNSFeatures.load_all_top_alexa()
    print(" << Loaded top alexa: ")

    # Go through every subset in the dataset folder (hidden entries skipped).
    # enumerate() replaces the original hand-maintained `index` counter.
    visible = (d for d in os.listdir(c.datasets_folder) if not d.startswith("."))
    for index, sub_set in enumerate(visible, 1):
        logger.info("--------------------------------------------------------")
        logger.info("-------- #{} {} extraction".format(index, sub_set))
        logger.info("--------------------------------------------------------")

        extract_features.extraction_manager(c.datasets_folder + sub_set + '/bro/')

    # Add certificate to connections that do not contain any certificate.
    extract_features.add_cert_to_non_cert_conn()

    # Compute features and save them.
42 | #extract_features.create_dataset_dns() 43 | logger.info("computing features...") 44 | extract_features.create_balanced_dataset() 45 | 46 | # Print final statistic 47 | extract_features.print_statistic() 48 | # Extract_features.compute_features() 49 | extract_features.save_dataset_information() 50 | 51 | total_time = datetime.timedelta(seconds=time() - start_time) 52 | print "<<< All dataset successfully finished in aproximate time: " + str(total_time) 53 | 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /features_extraction/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/features_extraction/__init.py__.py -------------------------------------------------------------------------------- /features_extraction/top_level_domain: -------------------------------------------------------------------------------- 1 | # Version 2017090700, Last Updated Thu Sep 7 07:07:01 2017 UTC 2 | AAA 3 | AARP 4 | ABARTH 5 | ABB 6 | ABBOTT 7 | ABBVIE 8 | ABC 9 | ABLE 10 | ABOGADO 11 | ABUDHABI 12 | AC 13 | ACADEMY 14 | ACCENTURE 15 | ACCOUNTANT 16 | ACCOUNTANTS 17 | ACO 18 | ACTIVE 19 | ACTOR 20 | AD 21 | ADAC 22 | ADS 23 | ADULT 24 | AE 25 | AEG 26 | AERO 27 | AETNA 28 | AF 29 | AFAMILYCOMPANY 30 | AFL 31 | AFRICA 32 | AG 33 | AGAKHAN 34 | AGENCY 35 | AI 36 | AIG 37 | AIGO 38 | AIRBUS 39 | AIRFORCE 40 | AIRTEL 41 | AKDN 42 | AL 43 | ALFAROMEO 44 | ALIBABA 45 | ALIPAY 46 | ALLFINANZ 47 | ALLSTATE 48 | ALLY 49 | ALSACE 50 | ALSTOM 51 | AM 52 | AMERICANEXPRESS 53 | AMERICANFAMILY 54 | AMEX 55 | AMFAM 56 | AMICA 57 | AMSTERDAM 58 | ANALYTICS 59 | ANDROID 60 | ANQUAN 61 | ANZ 62 | AO 63 | AOL 64 | APARTMENTS 65 | APP 66 | APPLE 67 | AQ 68 | AQUARELLE 69 | AR 70 | ARAB 71 | ARAMCO 72 | ARCHI 73 | ARMY 74 | ARPA 75 | ART 76 | ARTE 77 | AS 78 | ASDA 79 | 
ASIA 80 | ASSOCIATES 81 | AT 82 | ATHLETA 83 | ATTORNEY 84 | AU 85 | AUCTION 86 | AUDI 87 | AUDIBLE 88 | AUDIO 89 | AUSPOST 90 | AUTHOR 91 | AUTO 92 | AUTOS 93 | AVIANCA 94 | AW 95 | AWS 96 | AX 97 | AXA 98 | AZ 99 | AZURE 100 | BA 101 | BABY 102 | BAIDU 103 | BANAMEX 104 | BANANAREPUBLIC 105 | BAND 106 | BANK 107 | BAR 108 | BARCELONA 109 | BARCLAYCARD 110 | BARCLAYS 111 | BAREFOOT 112 | BARGAINS 113 | BASEBALL 114 | BASKETBALL 115 | BAUHAUS 116 | BAYERN 117 | BB 118 | BBC 119 | BBT 120 | BBVA 121 | BCG 122 | BCN 123 | BD 124 | BE 125 | BEATS 126 | BEAUTY 127 | BEER 128 | BENTLEY 129 | BERLIN 130 | BEST 131 | BESTBUY 132 | BET 133 | BF 134 | BG 135 | BH 136 | BHARTI 137 | BI 138 | BIBLE 139 | BID 140 | BIKE 141 | BING 142 | BINGO 143 | BIO 144 | BIZ 145 | BJ 146 | BLACK 147 | BLACKFRIDAY 148 | BLANCO 149 | BLOCKBUSTER 150 | BLOG 151 | BLOOMBERG 152 | BLUE 153 | BM 154 | BMS 155 | BMW 156 | BN 157 | BNL 158 | BNPPARIBAS 159 | BO 160 | BOATS 161 | BOEHRINGER 162 | BOFA 163 | BOM 164 | BOND 165 | BOO 166 | BOOK 167 | BOOKING 168 | BOOTS 169 | BOSCH 170 | BOSTIK 171 | BOSTON 172 | BOT 173 | BOUTIQUE 174 | BOX 175 | BR 176 | BRADESCO 177 | BRIDGESTONE 178 | BROADWAY 179 | BROKER 180 | BROTHER 181 | BRUSSELS 182 | BS 183 | BT 184 | BUDAPEST 185 | BUGATTI 186 | BUILD 187 | BUILDERS 188 | BUSINESS 189 | BUY 190 | BUZZ 191 | BV 192 | BW 193 | BY 194 | BZ 195 | BZH 196 | CA 197 | CAB 198 | CAFE 199 | CAL 200 | CALL 201 | CALVINKLEIN 202 | CAM 203 | CAMERA 204 | CAMP 205 | CANCERRESEARCH 206 | CANON 207 | CAPETOWN 208 | CAPITAL 209 | CAPITALONE 210 | CAR 211 | CARAVAN 212 | CARDS 213 | CARE 214 | CAREER 215 | CAREERS 216 | CARS 217 | CARTIER 218 | CASA 219 | CASE 220 | CASEIH 221 | CASH 222 | CASINO 223 | CAT 224 | CATERING 225 | CATHOLIC 226 | CBA 227 | CBN 228 | CBRE 229 | CBS 230 | CC 231 | CD 232 | CEB 233 | CENTER 234 | CEO 235 | CERN 236 | CF 237 | CFA 238 | CFD 239 | CG 240 | CH 241 | CHANEL 242 | CHANNEL 243 | CHASE 244 | CHAT 245 | CHEAP 246 | CHINTAI 247 | CHLOE 
248 | CHRISTMAS 249 | CHROME 250 | CHRYSLER 251 | CHURCH 252 | CI 253 | CIPRIANI 254 | CIRCLE 255 | CISCO 256 | CITADEL 257 | CITI 258 | CITIC 259 | CITY 260 | CITYEATS 261 | CK 262 | CL 263 | CLAIMS 264 | CLEANING 265 | CLICK 266 | CLINIC 267 | CLINIQUE 268 | CLOTHING 269 | CLOUD 270 | CLUB 271 | CLUBMED 272 | CM 273 | CN 274 | CO 275 | COACH 276 | CODES 277 | COFFEE 278 | COLLEGE 279 | COLOGNE 280 | COM 281 | COMCAST 282 | COMMBANK 283 | COMMUNITY 284 | COMPANY 285 | COMPARE 286 | COMPUTER 287 | COMSEC 288 | CONDOS 289 | CONSTRUCTION 290 | CONSULTING 291 | CONTACT 292 | CONTRACTORS 293 | COOKING 294 | COOKINGCHANNEL 295 | COOL 296 | COOP 297 | CORSICA 298 | COUNTRY 299 | COUPON 300 | COUPONS 301 | COURSES 302 | CR 303 | CREDIT 304 | CREDITCARD 305 | CREDITUNION 306 | CRICKET 307 | CROWN 308 | CRS 309 | CRUISE 310 | CRUISES 311 | CSC 312 | CU 313 | CUISINELLA 314 | CV 315 | CW 316 | CX 317 | CY 318 | CYMRU 319 | CYOU 320 | CZ 321 | DABUR 322 | DAD 323 | DANCE 324 | DATA 325 | DATE 326 | DATING 327 | DATSUN 328 | DAY 329 | DCLK 330 | DDS 331 | DE 332 | DEAL 333 | DEALER 334 | DEALS 335 | DEGREE 336 | DELIVERY 337 | DELL 338 | DELOITTE 339 | DELTA 340 | DEMOCRAT 341 | DENTAL 342 | DENTIST 343 | DESI 344 | DESIGN 345 | DEV 346 | DHL 347 | DIAMONDS 348 | DIET 349 | DIGITAL 350 | DIRECT 351 | DIRECTORY 352 | DISCOUNT 353 | DISCOVER 354 | DISH 355 | DIY 356 | DJ 357 | DK 358 | DM 359 | DNP 360 | DO 361 | DOCS 362 | DOCTOR 363 | DODGE 364 | DOG 365 | DOHA 366 | DOMAINS 367 | DOT 368 | DOWNLOAD 369 | DRIVE 370 | DTV 371 | DUBAI 372 | DUCK 373 | DUNLOP 374 | DUNS 375 | DUPONT 376 | DURBAN 377 | DVAG 378 | DVR 379 | DZ 380 | EARTH 381 | EAT 382 | EC 383 | ECO 384 | EDEKA 385 | EDU 386 | EDUCATION 387 | EE 388 | EG 389 | EMAIL 390 | EMERCK 391 | ENERGY 392 | ENGINEER 393 | ENGINEERING 394 | ENTERPRISES 395 | EPOST 396 | EPSON 397 | EQUIPMENT 398 | ER 399 | ERICSSON 400 | ERNI 401 | ES 402 | ESQ 403 | ESTATE 404 | ESURANCE 405 | ET 406 | ETISALAT 407 | EU 408 | EUROVISION 409 
| EUS 410 | EVENTS 411 | EVERBANK 412 | EXCHANGE 413 | EXPERT 414 | EXPOSED 415 | EXPRESS 416 | EXTRASPACE 417 | FAGE 418 | FAIL 419 | FAIRWINDS 420 | FAITH 421 | FAMILY 422 | FAN 423 | FANS 424 | FARM 425 | FARMERS 426 | FASHION 427 | FAST 428 | FEDEX 429 | FEEDBACK 430 | FERRARI 431 | FERRERO 432 | FI 433 | FIAT 434 | FIDELITY 435 | FIDO 436 | FILM 437 | FINAL 438 | FINANCE 439 | FINANCIAL 440 | FIRE 441 | FIRESTONE 442 | FIRMDALE 443 | FISH 444 | FISHING 445 | FIT 446 | FITNESS 447 | FJ 448 | FK 449 | FLICKR 450 | FLIGHTS 451 | FLIR 452 | FLORIST 453 | FLOWERS 454 | FLY 455 | FM 456 | FO 457 | FOO 458 | FOOD 459 | FOODNETWORK 460 | FOOTBALL 461 | FORD 462 | FOREX 463 | FORSALE 464 | FORUM 465 | FOUNDATION 466 | FOX 467 | FR 468 | FREE 469 | FRESENIUS 470 | FRL 471 | FROGANS 472 | FRONTDOOR 473 | FRONTIER 474 | FTR 475 | FUJITSU 476 | FUJIXEROX 477 | FUN 478 | FUND 479 | FURNITURE 480 | FUTBOL 481 | FYI 482 | GA 483 | GAL 484 | GALLERY 485 | GALLO 486 | GALLUP 487 | GAME 488 | GAMES 489 | GAP 490 | GARDEN 491 | GB 492 | GBIZ 493 | GD 494 | GDN 495 | GE 496 | GEA 497 | GENT 498 | GENTING 499 | GEORGE 500 | GF 501 | GG 502 | GGEE 503 | GH 504 | GI 505 | GIFT 506 | GIFTS 507 | GIVES 508 | GIVING 509 | GL 510 | GLADE 511 | GLASS 512 | GLE 513 | GLOBAL 514 | GLOBO 515 | GM 516 | GMAIL 517 | GMBH 518 | GMO 519 | GMX 520 | GN 521 | GODADDY 522 | GOLD 523 | GOLDPOINT 524 | GOLF 525 | GOO 526 | GOODHANDS 527 | GOODYEAR 528 | GOOG 529 | GOOGLE 530 | GOP 531 | GOT 532 | GOV 533 | GP 534 | GQ 535 | GR 536 | GRAINGER 537 | GRAPHICS 538 | GRATIS 539 | GREEN 540 | GRIPE 541 | GROCERY 542 | GROUP 543 | GS 544 | GT 545 | GU 546 | GUARDIAN 547 | GUCCI 548 | GUGE 549 | GUIDE 550 | GUITARS 551 | GURU 552 | GW 553 | GY 554 | HAIR 555 | HAMBURG 556 | HANGOUT 557 | HAUS 558 | HBO 559 | HDFC 560 | HDFCBANK 561 | HEALTH 562 | HEALTHCARE 563 | HELP 564 | HELSINKI 565 | HERE 566 | HERMES 567 | HGTV 568 | HIPHOP 569 | HISAMITSU 570 | HITACHI 571 | HIV 572 | HK 573 | HKT 574 | HM 575 | HN 
576 | HOCKEY 577 | HOLDINGS 578 | HOLIDAY 579 | HOMEDEPOT 580 | HOMEGOODS 581 | HOMES 582 | HOMESENSE 583 | HONDA 584 | HONEYWELL 585 | HORSE 586 | HOSPITAL 587 | HOST 588 | HOSTING 589 | HOT 590 | HOTELES 591 | HOTELS 592 | HOTMAIL 593 | HOUSE 594 | HOW 595 | HR 596 | HSBC 597 | HT 598 | HTC 599 | HU 600 | HUGHES 601 | HYATT 602 | HYUNDAI 603 | IBM 604 | ICBC 605 | ICE 606 | ICU 607 | ID 608 | IE 609 | IEEE 610 | IFM 611 | IKANO 612 | IL 613 | IM 614 | IMAMAT 615 | IMDB 616 | IMMO 617 | IMMOBILIEN 618 | IN 619 | INDUSTRIES 620 | INFINITI 621 | INFO 622 | ING 623 | INK 624 | INSTITUTE 625 | INSURANCE 626 | INSURE 627 | INT 628 | INTEL 629 | INTERNATIONAL 630 | INTUIT 631 | INVESTMENTS 632 | IO 633 | IPIRANGA 634 | IQ 635 | IR 636 | IRISH 637 | IS 638 | ISELECT 639 | ISMAILI 640 | IST 641 | ISTANBUL 642 | IT 643 | ITAU 644 | ITV 645 | IVECO 646 | IWC 647 | JAGUAR 648 | JAVA 649 | JCB 650 | JCP 651 | JE 652 | JEEP 653 | JETZT 654 | JEWELRY 655 | JIO 656 | JLC 657 | JLL 658 | JM 659 | JMP 660 | JNJ 661 | JO 662 | JOBS 663 | JOBURG 664 | JOT 665 | JOY 666 | JP 667 | JPMORGAN 668 | JPRS 669 | JUEGOS 670 | JUNIPER 671 | KAUFEN 672 | KDDI 673 | KE 674 | KERRYHOTELS 675 | KERRYLOGISTICS 676 | KERRYPROPERTIES 677 | KFH 678 | KG 679 | KH 680 | KI 681 | KIA 682 | KIM 683 | KINDER 684 | KINDLE 685 | KITCHEN 686 | KIWI 687 | KM 688 | KN 689 | KOELN 690 | KOMATSU 691 | KOSHER 692 | KP 693 | KPMG 694 | KPN 695 | KR 696 | KRD 697 | KRED 698 | KUOKGROUP 699 | KW 700 | KY 701 | KYOTO 702 | KZ 703 | LA 704 | LACAIXA 705 | LADBROKES 706 | LAMBORGHINI 707 | LAMER 708 | LANCASTER 709 | LANCIA 710 | LANCOME 711 | LAND 712 | LANDROVER 713 | LANXESS 714 | LASALLE 715 | LAT 716 | LATINO 717 | LATROBE 718 | LAW 719 | LAWYER 720 | LB 721 | LC 722 | LDS 723 | LEASE 724 | LECLERC 725 | LEFRAK 726 | LEGAL 727 | LEGO 728 | LEXUS 729 | LGBT 730 | LI 731 | LIAISON 732 | LIDL 733 | LIFE 734 | LIFEINSURANCE 735 | LIFESTYLE 736 | LIGHTING 737 | LIKE 738 | LILLY 739 | LIMITED 740 | LIMO 741 | LINCOLN 
742 | LINDE 743 | LINK 744 | LIPSY 745 | LIVE 746 | LIVING 747 | LIXIL 748 | LK 749 | LOAN 750 | LOANS 751 | LOCKER 752 | LOCUS 753 | LOFT 754 | LOL 755 | LONDON 756 | LOTTE 757 | LOTTO 758 | LOVE 759 | LPL 760 | LPLFINANCIAL 761 | LR 762 | LS 763 | LT 764 | LTD 765 | LTDA 766 | LU 767 | LUNDBECK 768 | LUPIN 769 | LUXE 770 | LUXURY 771 | LV 772 | LY 773 | MA 774 | MACYS 775 | MADRID 776 | MAIF 777 | MAISON 778 | MAKEUP 779 | MAN 780 | MANAGEMENT 781 | MANGO 782 | MAP 783 | MARKET 784 | MARKETING 785 | MARKETS 786 | MARRIOTT 787 | MARSHALLS 788 | MASERATI 789 | MATTEL 790 | MBA 791 | MC 792 | MCKINSEY 793 | MD 794 | ME 795 | MED 796 | MEDIA 797 | MEET 798 | MELBOURNE 799 | MEME 800 | MEMORIAL 801 | MEN 802 | MENU 803 | MEO 804 | MERCKMSD 805 | METLIFE 806 | MG 807 | MH 808 | MIAMI 809 | MICROSOFT 810 | MIL 811 | MINI 812 | MINT 813 | MIT 814 | MITSUBISHI 815 | MK 816 | ML 817 | MLB 818 | MLS 819 | MM 820 | MMA 821 | MN 822 | MO 823 | MOBI 824 | MOBILE 825 | MOBILY 826 | MODA 827 | MOE 828 | MOI 829 | MOM 830 | MONASH 831 | MONEY 832 | MONSTER 833 | MOPAR 834 | MORMON 835 | MORTGAGE 836 | MOSCOW 837 | MOTO 838 | MOTORCYCLES 839 | MOV 840 | MOVIE 841 | MOVISTAR 842 | MP 843 | MQ 844 | MR 845 | MS 846 | MSD 847 | MT 848 | MTN 849 | MTR 850 | MU 851 | MUSEUM 852 | MUTUAL 853 | MV 854 | MW 855 | MX 856 | MY 857 | MZ 858 | NA 859 | NAB 860 | NADEX 861 | NAGOYA 862 | NAME 863 | NATIONWIDE 864 | NATURA 865 | NAVY 866 | NBA 867 | NC 868 | NE 869 | NEC 870 | NET 871 | NETBANK 872 | NETFLIX 873 | NETWORK 874 | NEUSTAR 875 | NEW 876 | NEWHOLLAND 877 | NEWS 878 | NEXT 879 | NEXTDIRECT 880 | NEXUS 881 | NF 882 | NFL 883 | NG 884 | NGO 885 | NHK 886 | NI 887 | NICO 888 | NIKE 889 | NIKON 890 | NINJA 891 | NISSAN 892 | NISSAY 893 | NL 894 | NO 895 | NOKIA 896 | NORTHWESTERNMUTUAL 897 | NORTON 898 | NOW 899 | NOWRUZ 900 | NOWTV 901 | NP 902 | NR 903 | NRA 904 | NRW 905 | NTT 906 | NU 907 | NYC 908 | NZ 909 | OBI 910 | OBSERVER 911 | OFF 912 | OFFICE 913 | OKINAWA 914 | OLAYAN 915 | 
OLAYANGROUP 916 | OLDNAVY 917 | OLLO 918 | OM 919 | OMEGA 920 | ONE 921 | ONG 922 | ONL 923 | ONLINE 924 | ONYOURSIDE 925 | OOO 926 | OPEN 927 | ORACLE 928 | ORANGE 929 | ORG 930 | ORGANIC 931 | ORIGINS 932 | OSAKA 933 | OTSUKA 934 | OTT 935 | OVH 936 | PA 937 | PAGE 938 | PAMPEREDCHEF 939 | PANASONIC 940 | PANERAI 941 | PARIS 942 | PARS 943 | PARTNERS 944 | PARTS 945 | PARTY 946 | PASSAGENS 947 | PAY 948 | PCCW 949 | PE 950 | PET 951 | PF 952 | PFIZER 953 | PG 954 | PH 955 | PHARMACY 956 | PHD 957 | PHILIPS 958 | PHONE 959 | PHOTO 960 | PHOTOGRAPHY 961 | PHOTOS 962 | PHYSIO 963 | PIAGET 964 | PICS 965 | PICTET 966 | PICTURES 967 | PID 968 | PIN 969 | PING 970 | PINK 971 | PIONEER 972 | PIZZA 973 | PK 974 | PL 975 | PLACE 976 | PLAY 977 | PLAYSTATION 978 | PLUMBING 979 | PLUS 980 | PM 981 | PN 982 | PNC 983 | POHL 984 | POKER 985 | POLITIE 986 | PORN 987 | POST 988 | PR 989 | PRAMERICA 990 | PRAXI 991 | PRESS 992 | PRIME 993 | PRO 994 | PROD 995 | PRODUCTIONS 996 | PROF 997 | PROGRESSIVE 998 | PROMO 999 | PROPERTIES 1000 | PROPERTY 1001 | PROTECTION 1002 | PRU 1003 | PRUDENTIAL 1004 | PS 1005 | PT 1006 | PUB 1007 | PW 1008 | PWC 1009 | PY 1010 | QA 1011 | QPON 1012 | QUEBEC 1013 | QUEST 1014 | QVC 1015 | RACING 1016 | RADIO 1017 | RAID 1018 | RE 1019 | READ 1020 | REALESTATE 1021 | REALTOR 1022 | REALTY 1023 | RECIPES 1024 | RED 1025 | REDSTONE 1026 | REDUMBRELLA 1027 | REHAB 1028 | REISE 1029 | REISEN 1030 | REIT 1031 | RELIANCE 1032 | REN 1033 | RENT 1034 | RENTALS 1035 | REPAIR 1036 | REPORT 1037 | REPUBLICAN 1038 | REST 1039 | RESTAURANT 1040 | REVIEW 1041 | REVIEWS 1042 | REXROTH 1043 | RICH 1044 | RICHARDLI 1045 | RICOH 1046 | RIGHTATHOME 1047 | RIL 1048 | RIO 1049 | RIP 1050 | RMIT 1051 | RO 1052 | ROCHER 1053 | ROCKS 1054 | RODEO 1055 | ROGERS 1056 | ROOM 1057 | RS 1058 | RSVP 1059 | RU 1060 | RUGBY 1061 | RUHR 1062 | RUN 1063 | RW 1064 | RWE 1065 | RYUKYU 1066 | SA 1067 | SAARLAND 1068 | SAFE 1069 | SAFETY 1070 | SAKURA 1071 | SALE 1072 | SALON 1073 | 
SAMSCLUB 1074 | SAMSUNG 1075 | SANDVIK 1076 | SANDVIKCOROMANT 1077 | SANOFI 1078 | SAP 1079 | SAPO 1080 | SARL 1081 | SAS 1082 | SAVE 1083 | SAXO 1084 | SB 1085 | SBI 1086 | SBS 1087 | SC 1088 | SCA 1089 | SCB 1090 | SCHAEFFLER 1091 | SCHMIDT 1092 | SCHOLARSHIPS 1093 | SCHOOL 1094 | SCHULE 1095 | SCHWARZ 1096 | SCIENCE 1097 | SCJOHNSON 1098 | SCOR 1099 | SCOT 1100 | SD 1101 | SE 1102 | SEARCH 1103 | SEAT 1104 | SECURE 1105 | SECURITY 1106 | SEEK 1107 | SELECT 1108 | SENER 1109 | SERVICES 1110 | SES 1111 | SEVEN 1112 | SEW 1113 | SEX 1114 | SEXY 1115 | SFR 1116 | SG 1117 | SH 1118 | SHANGRILA 1119 | SHARP 1120 | SHAW 1121 | SHELL 1122 | SHIA 1123 | SHIKSHA 1124 | SHOES 1125 | SHOP 1126 | SHOPPING 1127 | SHOUJI 1128 | SHOW 1129 | SHOWTIME 1130 | SHRIRAM 1131 | SI 1132 | SILK 1133 | SINA 1134 | SINGLES 1135 | SITE 1136 | SJ 1137 | SK 1138 | SKI 1139 | SKIN 1140 | SKY 1141 | SKYPE 1142 | SL 1143 | SLING 1144 | SM 1145 | SMART 1146 | SMILE 1147 | SN 1148 | SNCF 1149 | SO 1150 | SOCCER 1151 | SOCIAL 1152 | SOFTBANK 1153 | SOFTWARE 1154 | SOHU 1155 | SOLAR 1156 | SOLUTIONS 1157 | SONG 1158 | SONY 1159 | SOY 1160 | SPACE 1161 | SPIEGEL 1162 | SPOT 1163 | SPREADBETTING 1164 | SR 1165 | SRL 1166 | SRT 1167 | ST 1168 | STADA 1169 | STAPLES 1170 | STAR 1171 | STARHUB 1172 | STATEBANK 1173 | STATEFARM 1174 | STATOIL 1175 | STC 1176 | STCGROUP 1177 | STOCKHOLM 1178 | STORAGE 1179 | STORE 1180 | STREAM 1181 | STUDIO 1182 | STUDY 1183 | STYLE 1184 | SU 1185 | SUCKS 1186 | SUPPLIES 1187 | SUPPLY 1188 | SUPPORT 1189 | SURF 1190 | SURGERY 1191 | SUZUKI 1192 | SV 1193 | SWATCH 1194 | SWIFTCOVER 1195 | SWISS 1196 | SX 1197 | SY 1198 | SYDNEY 1199 | SYMANTEC 1200 | SYSTEMS 1201 | SZ 1202 | TAB 1203 | TAIPEI 1204 | TALK 1205 | TAOBAO 1206 | TARGET 1207 | TATAMOTORS 1208 | TATAR 1209 | TATTOO 1210 | TAX 1211 | TAXI 1212 | TC 1213 | TCI 1214 | TD 1215 | TDK 1216 | TEAM 1217 | TECH 1218 | TECHNOLOGY 1219 | TEL 1220 | TELECITY 1221 | TELEFONICA 1222 | TEMASEK 1223 | TENNIS 1224 | TEVA 1225 | 
TF 1226 | TG 1227 | TH 1228 | THD 1229 | THEATER 1230 | THEATRE 1231 | TIAA 1232 | TICKETS 1233 | TIENDA 1234 | TIFFANY 1235 | TIPS 1236 | TIRES 1237 | TIROL 1238 | TJ 1239 | TJMAXX 1240 | TJX 1241 | TK 1242 | TKMAXX 1243 | TL 1244 | TM 1245 | TMALL 1246 | TN 1247 | TO 1248 | TODAY 1249 | TOKYO 1250 | TOOLS 1251 | TOP 1252 | TORAY 1253 | TOSHIBA 1254 | TOTAL 1255 | TOURS 1256 | TOWN 1257 | TOYOTA 1258 | TOYS 1259 | TR 1260 | TRADE 1261 | TRADING 1262 | TRAINING 1263 | TRAVEL 1264 | TRAVELCHANNEL 1265 | TRAVELERS 1266 | TRAVELERSINSURANCE 1267 | TRUST 1268 | TRV 1269 | TT 1270 | TUBE 1271 | TUI 1272 | TUNES 1273 | TUSHU 1274 | TV 1275 | TVS 1276 | TW 1277 | TZ 1278 | UA 1279 | UBANK 1280 | UBS 1281 | UCONNECT 1282 | UG 1283 | UK 1284 | UNICOM 1285 | UNIVERSITY 1286 | UNO 1287 | UOL 1288 | UPS 1289 | US 1290 | UY 1291 | UZ 1292 | VA 1293 | VACATIONS 1294 | VANA 1295 | VANGUARD 1296 | VC 1297 | VE 1298 | VEGAS 1299 | VENTURES 1300 | VERISIGN 1301 | VERSICHERUNG 1302 | VET 1303 | VG 1304 | VI 1305 | VIAJES 1306 | VIDEO 1307 | VIG 1308 | VIKING 1309 | VILLAS 1310 | VIN 1311 | VIP 1312 | VIRGIN 1313 | VISA 1314 | VISION 1315 | VISTA 1316 | VISTAPRINT 1317 | VIVA 1318 | VIVO 1319 | VLAANDEREN 1320 | VN 1321 | VODKA 1322 | VOLKSWAGEN 1323 | VOLVO 1324 | VOTE 1325 | VOTING 1326 | VOTO 1327 | VOYAGE 1328 | VU 1329 | VUELOS 1330 | WALES 1331 | WALMART 1332 | WALTER 1333 | WANG 1334 | WANGGOU 1335 | WARMAN 1336 | WATCH 1337 | WATCHES 1338 | WEATHER 1339 | WEATHERCHANNEL 1340 | WEBCAM 1341 | WEBER 1342 | WEBSITE 1343 | WED 1344 | WEDDING 1345 | WEIBO 1346 | WEIR 1347 | WF 1348 | WHOSWHO 1349 | WIEN 1350 | WIKI 1351 | WILLIAMHILL 1352 | WIN 1353 | WINDOWS 1354 | WINE 1355 | WINNERS 1356 | WME 1357 | WOLTERSKLUWER 1358 | WOODSIDE 1359 | WORK 1360 | WORKS 1361 | WORLD 1362 | WOW 1363 | WS 1364 | WTC 1365 | WTF 1366 | XBOX 1367 | XEROX 1368 | XFINITY 1369 | XIHUAN 1370 | XIN 1371 | XN--11B4C3D 1372 | XN--1CK2E1B 1373 | XN--1QQW23A 1374 | XN--2SCRJ9C 1375 | XN--30RR7Y 1376 | 
XN--3BST00M 1377 | XN--3DS443G 1378 | XN--3E0B707E 1379 | XN--3HCRJ9C 1380 | XN--3OQ18VL8PN36A 1381 | XN--3PXU8K 1382 | XN--42C2D9A 1383 | XN--45BR5CYL 1384 | XN--45BRJ9C 1385 | XN--45Q11C 1386 | XN--4GBRIM 1387 | XN--54B7FTA0CC 1388 | XN--55QW42G 1389 | XN--55QX5D 1390 | XN--5SU34J936BGSG 1391 | XN--5TZM5G 1392 | XN--6FRZ82G 1393 | XN--6QQ986B3XL 1394 | XN--80ADXHKS 1395 | XN--80AO21A 1396 | XN--80AQECDR1A 1397 | XN--80ASEHDB 1398 | XN--80ASWG 1399 | XN--8Y0A063A 1400 | XN--90A3AC 1401 | XN--90AE 1402 | XN--90AIS 1403 | XN--9DBQ2A 1404 | XN--9ET52U 1405 | XN--9KRT00A 1406 | XN--B4W605FERD 1407 | XN--BCK1B9A5DRE4C 1408 | XN--C1AVG 1409 | XN--C2BR7G 1410 | XN--CCK2B3B 1411 | XN--CG4BKI 1412 | XN--CLCHC0EA0B2G2A9GCD 1413 | XN--CZR694B 1414 | XN--CZRS0T 1415 | XN--CZRU2D 1416 | XN--D1ACJ3B 1417 | XN--D1ALF 1418 | XN--E1A4C 1419 | XN--ECKVDTC9D 1420 | XN--EFVY88H 1421 | XN--ESTV75G 1422 | XN--FCT429K 1423 | XN--FHBEI 1424 | XN--FIQ228C5HS 1425 | XN--FIQ64B 1426 | XN--FIQS8S 1427 | XN--FIQZ9S 1428 | XN--FJQ720A 1429 | XN--FLW351E 1430 | XN--FPCRJ9C3D 1431 | XN--FZC2C9E2C 1432 | XN--FZYS8D69UVGM 1433 | XN--G2XX48C 1434 | XN--GCKR3F0F 1435 | XN--GECRJ9C 1436 | XN--GK3AT1E 1437 | XN--H2BREG3EVE 1438 | XN--H2BRJ9C 1439 | XN--H2BRJ9C8C 1440 | XN--HXT814E 1441 | XN--I1B6B1A6A2E 1442 | XN--IMR513N 1443 | XN--IO0A7I 1444 | XN--J1AEF 1445 | XN--J1AMH 1446 | XN--J6W193G 1447 | XN--JLQ61U9W7B 1448 | XN--JVR189M 1449 | XN--KCRX77D1X4A 1450 | XN--KPRW13D 1451 | XN--KPRY57D 1452 | XN--KPU716F 1453 | XN--KPUT3I 1454 | XN--L1ACC 1455 | XN--LGBBAT1AD8J 1456 | XN--MGB9AWBF 1457 | XN--MGBA3A3EJT 1458 | XN--MGBA3A4F16A 1459 | XN--MGBA7C0BBN0A 1460 | XN--MGBAAKC7DVF 1461 | XN--MGBAAM7A8H 1462 | XN--MGBAB2BD 1463 | XN--MGBAI9AZGQP6J 1464 | XN--MGBAYH7GPA 1465 | XN--MGBB9FBPOB 1466 | XN--MGBBH1A 1467 | XN--MGBBH1A71E 1468 | XN--MGBC0A9AZCG 1469 | XN--MGBCA7DZDO 1470 | XN--MGBERP4A5D4AR 1471 | XN--MGBGU82A 1472 | XN--MGBI4ECEXP 1473 | XN--MGBPL2FH 1474 | XN--MGBT3DHD 1475 | XN--MGBTX2B 1476 | 
# Singleton logger
#
# get_logger() lazily builds one shared logging.Logger named after the entry
# script and returns that same instance on every subsequent call.

import logging
import config as c

logger = None  # module-level singleton instance


def get_logger(loglevel, append=True):
    """Return the process-wide logger, creating it on first call.

    loglevel -- level name such as "debug" / "info" (case-insensitive);
                an unknown name raises ValueError
    append   -- when False, truncate the log file instead of appending

    The logger writes to <config.logs_folder>/<main script name>.log and to
    the console; both handlers are set to DEBUG so everything is captured.
    Later calls return the existing logger and ignore both arguments.
    """
    global logger
    if logger is not None:
        return logger  # already built: singleton short-circuit

    # BUGFIX: the previous default of logging.INFO made the isinstance check
    # below unreachable for unknown names, so a typo like "debgu" silently
    # became INFO instead of raising.  With None, invalid names fail loudly
    # as the ValueError branch clearly intended.
    numeric_level = getattr(logging, loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % loglevel)

    import sys
    import os
    # Name the logger (and its log file) after the script that was executed.
    module_name = str(os.path.basename(sys.modules['__main__'].__file__)).split('.')[0]

    logger = logging.getLogger(module_name)
    logger.setLevel(numeric_level)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(c.logs_folder + module_name + '.log', mode=('a' if append else 'w'))
    fh.setLevel(logging.DEBUG)
    # create console handler at the same level (everything also hits the console)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t\t%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logger created!")
    return logger
"ratio_of_differ_sandns_in_cert", 41 | "ratio_of_same_subjects", 42 | "ratio_of_same_issuer", 43 | "ratio_is_same_CN_and_SNI", 44 | "average_certificate_exponent", 45 | "is_SNI_in_top_level_domain", 46 | "ratio_certificate_path_error", 47 | "ratio_missing_cert_in_cert_path", 48 | "in_alexa_top100", 49 | "in_alexa_top1k", 50 | "in_alexa_top10k", 51 | "in_alexa_top100k", 52 | "in_alexa_top1m", 53 | "not_in_alexa", 54 | "FQDN_length", 55 | "domain_name_length", 56 | "number_of_numerical_chars", 57 | "number_of_non_alphanumeric_chars", 58 | "number_unique_IP_addresses_in_response", 59 | "number_of_subdomains", 60 | "average_ttls", 61 | "std_ttls", 62 | "min_ttls", 63 | "max_ttls", 64 | "number_of_hyphens_in_fqdn", 65 | "length_of_longest_subdomain_name", 66 | "number_of_voyels_in_fqdn", 67 | "number_of_different_chars_in_fqdn", 68 | "number_of_consonants_in_fqdn", 69 | "shannon_entropy_2ld", 70 | "shannon_entropy_3ld"] 71 | """ 72 | 73 | less_important_features = [ 74 | "SNI_equal_DstIP", 75 | "ratio_of_differ_issuer_in_cert", 76 | "ratio_certificate_path_error", 77 | "ratio_missing_cert_in_cert_path", 78 | "standart_deviation_cert_length", 79 | "ratio_of_differ_subject_in_cert", 80 | "percent_of_established_states", 81 | "ratio_of_differ_issuer_in_ssl_log", 82 | "ratio_of_differ_subject_in_ssl_log", 83 | "is_SNI_in_top_level_domain", 84 | 85 | "ratio_of_same_issuer", 86 | "ratio_of_differ_sandns_in_cert", 87 | "in_alexa_top100k", 88 | "tls_version_ratio", 89 | "is_SNIs_in_SNA_dns", 90 | "in_alexa_top10k", 91 | "average_public_key", 92 | "number_of_hyphens_in_fqdn", 93 | "ratio_of_same_subjects", 94 | "average_certificate_exponent", 95 | 96 | "in_alexa_top1k", 97 | "is_CNs_in_SNA_dns", 98 | "amount_diff_certificates", 99 | "number_of_voyels_in_fqdn", 100 | "ssl_ratio", 101 | "in_alexa_top1m", 102 | "in_alexa_top100", 103 | "number_of_non_alphanumeric_chars", 104 | "x509_ssl_ratio", 105 | "number_of_flows", 106 | 107 | "periodicity_standart_deviation", 108 | 
"SNI_ssl_ratio", 109 | "length_of_longest_subdomain_name", 110 | "FQDN_length", 111 | "number_of_domains_in_certificate", 112 | "number_of_different_chars_in_fqdn", 113 | "percent_of_standard_deviation_duration", 114 | "domain_name_length", 115 | "ratio_is_same_CN_and_SNI", 116 | "number_of_certificate_path" 117 | ] 118 | 119 | def read_features(filename, features_name): 120 | import pandas as pd 121 | X = pd.read_csv(filename) 122 | return X[features_name] 123 | 124 | 125 | def read_labels(filename): 126 | with open(filename, 'r') as csvfile: 127 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 128 | y = csvreader.next() 129 | return y 130 | 131 | 132 | def get_features_name(): 133 | with open(c.model_folder + "features.csv", 'r') as csvfile: 134 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 135 | headers = csvreader.next() 136 | return headers[1:-1] 137 | 138 | 139 | def get_all_data(models_folder, set_name="all"): 140 | featuresname_all = get_features_name() 141 | features_set = { 142 | "all": featuresname_all[:63+1], 143 | "https": featuresname_all[:41], 144 | "dns": featuresname_all[41:63+1], 145 | "reduced": filter(lambda f: f not in less_important_features[:20], featuresname_all[:63+1]), 146 | "reduced_30": filter(lambda f: f not in less_important_features[:30], featuresname_all[:63+1]), 147 | "reduced_40": filter(lambda f: f not in less_important_features[:40], featuresname_all[:63+1]), 148 | "enhanced_30": filter(lambda f: f not in less_important_features[:30], featuresname_all) 149 | } 150 | 151 | X_train = read_features(models_folder + "X_train.csv", features_set[set_name]) 152 | X_test = read_features(models_folder + "X_test.csv", features_set[set_name]) 153 | y_train = read_labels(models_folder + "y_train.csv") 154 | y_test = read_labels(models_folder + "y_test.csv") 155 | #return np.array(X_train), np.array(X_test), np.array(y_train), 
import Get_normalize_data
import config as c
from logger import get_logger

# https://chrisalbon.com/machine_learning/feature_selection/anova_f-value_for_feature_selection/


def compare_quantitative_features(X, y, k=10):  # ANOVA F-value
    """Score every feature with the ANOVA F-test and print the scores.

    X -- feature matrix; y -- class labels.
    k -- how many top features SelectKBest is configured to keep (default 10,
         matching the previous hard-coded value; the old comment claimed
         "two best" which did not match the code).

    Returns the per-feature F-scores so callers can rank features themselves
    (the original returned None, so returning them is backward-compatible).
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif

    # Select the k features with the best ANOVA F-values
    fvalue_selector = SelectKBest(f_classif, k=k)

    # fit() is enough to populate scores_; the transformed matrix was unused.
    fvalue_selector.fit(X, y)

    # print() form is valid on both Python 2 and 3 (was a py2 print statement)
    print(fvalue_selector.scores_)
    return fvalue_selector.scores_


if __name__ == '__main__':
    logger = get_logger("debug")

    X_train, X_test, y_train, y_test = Get_normalize_data.get_all_data(c.model_folder)

    compare_quantitative_features(X_train, y_train)
    def __init__(self, name, classifier, param_grid=None):
        """Bundle a classifier with its name, search grid and evaluation state.

        name       -- display name used in logs and metric tables
        classifier -- scikit-learn style estimator (fit / predict / score)
        param_grid -- optional hyper-parameter grid; when set, train() wraps
                      the estimator in a grid or randomized search
        """
        self.classifier = classifier
        self.name = name
        self.param_grid = param_grid

        # Confusion-matrix cells; -1 is the "not computed yet" sentinel.
        self.tn = self.fp = self.fn = self.tp = -1
        # Metric name -> value in insertion order, filled by compute_metrics().
        self.metrics = OrderedDict()

        self.training_error = None
        self.is_trained = False

        self.score = None   # test-set score, set by predict()
        self.y_pred = None  # rounded test-set predictions, set by predict()

    def train(self, X_train, y_train, random=False):
        """Fit the classifier on the training data.

        When param_grid is set, the estimator is first wrapped in a
        GridSearchCV (random=False) or RandomizedSearchCV (random=True),
        both using 10-fold CV and precision scoring; self.classifier is
        replaced by the search object.  Sets training_error and is_trained.
        """
        if self.param_grid is not None and random is False:
            self.classifier = GridSearchCV(self.classifier, self.param_grid, cv=10, scoring='precision', n_jobs=-1)  # Do a 10-fold cross validation
        elif self.param_grid is not None and random is True:
            self.classifier = RandomizedSearchCV(self.classifier, param_distributions=self.param_grid,
                                                 n_iter=10, scoring='precision',
                                                 n_jobs=-1, cv=10, verbose=3, random_state=1001)

        logger.info('Training classifier {}'.format(self.name))
        main_tools.benchmark(self.classifier.fit, X_train, y_train)  # fit the classifier with data (timed)
        logger.info('Trained classifier {}'.format(self.name))
        # NOTE: despite the name, this is the training *score*, not an error rate.
        self.training_error = self.classifier.score(X_train, y_train)

        if self.param_grid is not None:
            logger.debug("Grid search best score = {}".format(self.classifier.best_score_))
            logger.debug("Grid search best estimator = {}".format(self.classifier.best_estimator_))
            logger.debug("Grid search cv results = {}".format(self.classifier.cv_results_))
        else:
            logger.debug("Model parameters = {}".format(self.classifier.get_params()))
        self.is_trained = True

    def predict(self, X_test, y_test):
        """Score the trained classifier on the test set and store predictions."""
        if not self.is_trained:
            raise Exception('Model not trained, please run train()')

        self.score = self.classifier.score(X_test, y_test)
        self.y_pred = [round(value) for value in self.classifier.predict(X_test)]  # Call predict on the estimator (with the best found parameters if Grid search).
        # Round in case the estimator returns probabilities (like with XGBoost)

    def compute_metrics(self, y_test):
        """Fill self.metrics from the confusion matrix of the last predict().

        Metrics whose denominator would make them meaningless (tp or tn of 0)
        are recorded as -1, the project's "undefined" sentinel, rather than
        raising a ZeroDivisionError.
        """
        if self.y_pred is None:
            raise Exception('No prediction found, please run predict()')

        from sklearn import metrics
        # labels=[0,1] pins the cell order even if a class is absent from y_test.
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, self.y_pred, labels=[0,1]).ravel()
        self.tn, self.fp, self.fn, self.tp = tn, fp, fn, tp

        logger.debug("tn={}, fp={}, fn={}, tp={}".format(tn, fp, fn, tp))

        tpr = -1 if tp <= 0 else float(tp) / (tp + fn)
        self.metrics["TPR"] = tpr  # True Positive Rate

        tnr = -1 if tn <= 0 else float(tn) / (fp + tn)
        self.metrics["TNR"] = tnr  # True Negative Rate

        fpr = -1 if tn <= 0 else float(fp) / (fp + tn)
        self.metrics["FPR"] = fpr  # False Positive Rate

        #fdr = -1 if tp <= 0 else float(fp) / (fp + tp)
        #self.metrics["FDR"] = fdr # False Discovery Rate

        accuracy = -1 if tp <= 0 or tn <= 0 else float(tp + tn) / (tp + tn + fp + fn)
        self.metrics["Acc"] = accuracy

        error_rate = -1 if tp <= 0 or tn <= 0 else float(fp + fn) / (tp + fn + fp + tn)
        self.metrics["Err"] = error_rate

        precision = -1 if tp <= 0 else float(tp) / (tp + fp)
        self.metrics["Pre"] = precision

        f_measure = -1 if precision <= 0 else float(2 * precision * tpr) / (precision + tpr)
        self.metrics["F-M"] = f_measure

        mcc = -1 if tp <= 0 or tn <= 0 else float(tp * tn - fp * fn) / \
            math.sqrt(float(tp + fn) * (tp + fp) * (tn + fp) * (tn + fn))
        self.metrics["MCC"] = mcc  # Matthew Correlation Coefficient

        roc_fpr, roc_tpr, thresholds = metrics.roc_curve(y_test, self.y_pred)
        self.metrics["AUC"] = metrics.auc(roc_fpr, roc_tpr)

    def get_printable_metrics(self):
        """Return a two-line, tab-separated report (header line + value line)
        for this model, timestamped with the current time."""
        if len(self.metrics) == 0:
            raise Exception('No metrics found, please run compute_metrics()')

        # Dead PrettyTable variant kept for reference (string literal is a no-op):
        """
        from prettytable import PrettyTable
        import operator

        headers = ['Model', 'Best score']
        headers += self.metrics.keys()

        table = PrettyTable(headers)
        content = [self.name, self.score]
        content += [round(float(m), 3) for m in self.metrics.values()]
        table.add_row(content)

        return table.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True)
        """

        headers = ['Exec time', 'Model', 'Best score']
        headers += self.metrics.keys()
        values = time.strftime("%Y-%m-%d_%H-%M-%S") + "\t" + "\t".join([self.name, str(self.score)] + map(str, self.metrics.values()))
        return "\t".join(headers) + "\n" + values



    @staticmethod
    def models_metric_summary(models):
        """Return one tab-separated table (header + one line per model) for a
        list of models that have all had compute_metrics() run.

        NOTE(review): the header is taken from models[0] -- assumes every
        model carries the same metric keys in the same order.
        """
        #from prettytable import PrettyTable
        #import operator

        headers = ['Model', 'Best score']
        headers += models[0].metrics.keys()
        """
        table = PrettyTable(headers)

        for model in models:
            if len(model.metrics) == 0:
                raise Exception('No metrics found for model "{}", please run compute_metrics()'.format(model.name))
            content = [model.name, model.score]
            content += [round(float(m), 3) for m in model.metrics.values()]
            table.add_row(content)

        return table.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True)
        """

        values = ""
        for model in models:
            if len(model.metrics) == 0:
                raise Exception('No metrics found for model "{}", please run compute_metrics()'.format(model.name))

            values += "\t".join([model.name, str(model.score)] + map(str,model.metrics.values())) + "\n"
        return "\t".join(headers) + "\n" + values

    def save(self, filename):
        """Pickle the (possibly search-wrapped) classifier to filename."""
        logger.info("Saving model to {}...".format(filename))
        pickle.dump(self.classifier, open(filename, "wb"))
        logger.info("Model saved to {}!".format(filename))

    def load(self, filename):
        """Replace self.classifier with a classifier unpickled from filename."""
        logger.info("Loading model from {}...".format(filename))
import os
import sys
sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/')

import csv

import pandas as pd


def normalize_data(data):
    """Scale every column of `data` to [0, 1] in place by dividing by the
    column maximum; return the mutated list.

    `data` is a list of equally-sized numeric rows.  A value of -1 is the
    project-wide "feature not computable" sentinel and is never rescaled.
    Columns whose maximum is 0 are left unchanged (avoids division by zero).
    """
    if not data:  # empty matrix: nothing to scale (previously crashed on data[0])
        return data
    for col in range(len(data[0])):
        col_max = 0  # renamed from `max`, which shadowed the builtin
        for row in range(len(data)):
            if col_max < data[row][col]:
                col_max = data[row][col]
        if col_max != 0:
            for row in range(len(data)):
                if data[row][col] != -1:  # keep the sentinel out of the scaling
                    data[row][col] = data[row][col] / float(col_max)
    return data


def write_features(filename, data, features_name):
    """Write the feature matrix as a headed CSV to <model_folder>/<filename>."""
    import config as c  # local import keeps this module importable without config.py
    df = pd.DataFrame(data, columns=features_name)
    df.to_csv(c.model_folder + filename, sep=',', encoding='utf-8', index=False)


def write_targets(file_name, data_list):
    """Write the label vector as a single CSV row to <model_folder>/<file_name>."""
    import config as c  # local import keeps this module importable without config.py
    index = 0

    # NOTE(review): 'wb' is the Python 2 csv convention; under Python 3 this
    # must become open(..., 'w', newline='') -- confirm the target interpreter.
    with open(c.model_folder + file_name, 'wb') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n', delimiter=',')
        writer.writerow(data_list)
        index += 1

    # print(...) with one pre-formatted string is identical on Python 2 and 3
    print("{} written lines: {}".format(file_name, index))


def transform_label(label):
    """Map a textual dataset label to a numeric target.

    Returns 1 when the label contains 'MALWARE' (checked first, so a label
    containing both markers counts as malware), 0 for 'NORMAL', and the
    sentinel -1 (with a warning) for anything else.
    """
    if 'MALWARE' in label:
        return 1
    elif "NORMAL" in label:
        return 0

    print("The label is incorrect")
    return -1


if __name__ == '__main__':
    # sklearn and config are only needed when run as a script.
    from sklearn.model_selection import train_test_split
    import config as c

    malwares = 0
    normals = 0

    X = list()
    y = list()

    LIMIT = -1  # total nb_lines, -1 = NO LIMIT

    with open(c.model_folder + "features.csv", 'r') as csvfile:
        csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        headers = next(csvreader)  # next() works on Python 2 and 3 (was .next())
        print(headers)
        line_nb = 0

        for row in csvreader:
            target = transform_label(row[-1])
            if LIMIT != -1:
                # Keep the two classes balanced at LIMIT/2 samples each;
                # rows beyond a class quota are skipped before being appended.
                if target == 1 and malwares < LIMIT / 2:
                    malwares += 1
                elif target == 0 and normals < LIMIT / 2:
                    normals += 1
                else:
                    continue
            else:
                malwares += target  # NOTE(review): an unknown label (-1) decrements this counter
                normals += 1 if target == 0 else 0

            X.append(row[1:-1])  # exclude key (index 0) and label (index -1 = last index)
            y.append(target)
            line_nb += 1

    features_name = headers[1:-1]

    # normalize X
    norm_X = normalize_data(X)
    print("Size of X: {}".format(len(X)))
    print("Malwares: {}".format(malwares))
    print("Normals: {}".format(len(X) - malwares))

    # split data by sklearn library
    X_train, X_test, y_train, y_test = train_test_split(norm_X, y, test_size=.2, random_state=35)

    # Write train data
    write_features('X_train.csv', X_train, features_name)
    write_targets('y_train.csv', y_train)

    # Write test data
    write_features('X_test.csv', X_test, features_name)
    write_targets('y_test.csv', y_test)
def select_models(models, models_name):
    """Return the subset of *models* whose ``name`` attribute is listed in *models_name*."""
    selected = []
    for model in models:
        if model.name in models_name:
            selected.append(model)
    return selected
"Random forest" 78 | classifier = RandomForestClassifier(n_jobs=-1) 79 | d_range = list(range(1, 31)) # list of parameter values to test 80 | #s_range = list(range(2, 10)) 81 | param_grid = dict(max_depth=d_range)#, min_samples_split=s_range) 82 | models.append(Model(name, classifier, param_grid)) 83 | 84 | #Naive Bayes 85 | name = "NB - Gaussian" 86 | classifier = GaussianNB() 87 | gnb = Model(name, classifier) 88 | models.append(gnb) 89 | 90 | #AdaBoost 91 | name = "AdaBoost" 92 | classifier = AdaBoostClassifier(n_estimators=100) 93 | adaboost = Model(name, classifier) 94 | models.append(adaboost) 95 | 96 | #Logistic Regression 97 | name = "Log. Regression" 98 | classifier = LogisticRegression() 99 | param_grid = dict(C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]) 100 | log_reg = Model(name, classifier, param_grid) 101 | models.append(log_reg) 102 | 103 | name = "Log. Reg l1" 104 | classifier = LogisticRegression(penalty='l1') 105 | param_grid = dict(C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]) 106 | log_reg_l1 = Model(name, classifier, param_grid) 107 | models.append(log_reg_l1) 108 | 109 | 110 | #Neural networks 111 | from sklearn.neural_network import MLPClassifier 112 | name = "Neural net" 113 | #classifier = MLPClassifier(alpha=1) 114 | #classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) 115 | classifier = MLPClassifier(solver='adam', alpha=1e-5, random_state=1) # from Strasak thesis 116 | nn = Model(name, classifier) 117 | models.append(nn) 118 | 119 | # SVM - Support Vector Machine 120 | name = "SVM - SVC" 121 | classifier = svm.SVC() 122 | C_range = np.logspace(-2, 10, 13) 123 | # print(C_range) 124 | gamma_range = np.logspace(-9, 3, 13) 125 | # print(gamma_range) 126 | param_grid = dict(gamma=gamma_range, C=C_range) 127 | models.append(Model(name, classifier, param_grid)) 128 | 129 | name = "SVM - Linear" 130 | classifier = svm.LinearSVC() 131 | #C_range = range(1,200,50) 132 | C_range 
= range(1,200,50) 133 | param_grid = dict(C=C_range) 134 | models.append(Model(name, classifier, param_grid)) 135 | 136 | name = "NB - Multinomial" 137 | classifier = MultinomialNB() 138 | models.append(Model(name, classifier)) 139 | 140 | name = "NB - Bernoulli" 141 | classifier = BernoulliNB() 142 | models.append(Model(name, classifier)) 143 | 144 | 145 | name = "XGBoost 1" 146 | classifier = XGBClassifier( 147 | learning_rate =0.1, 148 | n_estimators=1000, 149 | max_depth=10, 150 | min_child_weight=1, 151 | gamma=0, 152 | subsample=0.8, 153 | colsample_bytree=0.8, 154 | objective= 'binary:logistic', 155 | nthread=4, 156 | scale_pos_weight=1, 157 | seed=3) 158 | models.append(Model(name,classifier)) 159 | 160 | name = "XGBoost 2" 161 | classifier = XGBClassifier( 162 | learning_rate=0.1, 163 | n_estimators=1000, 164 | max_depth=3, 165 | min_child_weight=5, 166 | gamma=0.1, 167 | subsample=0.8, 168 | colsample_bytree=0.8, 169 | objective='binary:logistic', 170 | nthread=4, 171 | scale_pos_weight=1, 172 | seed=27) 173 | models.append(Model(name, classifier)) 174 | 175 | name = "XGBoost" 176 | classifier = XGBClassifier( 177 | learning_rate=0.1, 178 | n_estimators=1000, 179 | objective='binary:logistic', 180 | nthread=4, 181 | scale_pos_weight=1, 182 | seed=27) 183 | param_grid = { 184 | 'min_child_weight': [1, 5, 10], 185 | 'gamma': [0.5, 1, 1.5, 2, 5], 186 | 'subsample': [0.6, 0.8, 1.0], 187 | 'colsample_bytree': [0.6, 0.8, 1.0], 188 | 'max_depth': [3, 4, 5] 189 | } 190 | xgboost = Model(name, classifier, param_grid) 191 | models.append(xgboost) 192 | 193 | name = "Tuned XGBoost" 194 | classifier = XGBClassifier( 195 | learning_rate=0.1, 196 | n_estimators=1000, 197 | objective='binary:logistic', 198 | nthread=4, 199 | scale_pos_weight=1, 200 | seed=27) 201 | param_grid = { 202 | 'max_depth': range(3,10,2), 203 | 'min_child_weight': range(1,6,2), 204 | 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5, 2, 5], 205 | 'subsample': [i/10.0 for i in range(5,11)], 206 | 
'colsample_bytree': [i/10.0 for i in range(5,11)], 207 | 'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100] 208 | } 209 | tuned_xgboost = Model(name, classifier, param_grid) 210 | models.append(tuned_xgboost) 211 | 212 | 213 | name = "XGBoostBest" 214 | classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, 215 | colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, 216 | max_delta_step=0, max_depth=5, min_child_weight=1, missing=None, 217 | n_estimators=1000, n_jobs=1, nthread=4, objective='binary:logistic', 218 | random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 219 | seed=27, silent=True, subsample=0.8) 220 | xgboost_best = Model(name, classifier) 221 | 222 | 223 | #all_models = models.keys() 224 | models_to_train = ['XGBoost', 'k-NN', 'Decision tree', 'Random forest', 'NB - Gaussian','AdaBoost', 'Log. Regression', 'Neural net'] #, 'SVM - SVC'] 225 | 226 | # set_name can be: all, dns, https, reduced, reduced_30, reduced_40, enhanced_30 227 | set_name = "enhanced_30" 228 | X_train, X_test, y_train, y_test = Get_normalize_data.get_all_data(c.model_folder, set_name) 229 | 230 | #final_train(select_models(models, models_to_train), set_name) 231 | 232 | train(tuned_xgboost, set_name, random=False) 233 | 234 | #train(log_reg_l1, set_name) 235 | 236 | #train(log_reg, set_name) 237 | 238 | #train(xgboost_best, set_name) 239 | 240 | #train(xgboost, set_name, False) 241 | 242 | #train(gnb, set_name) 243 | 244 | 245 | -------------------------------------------------------------------------------- /main_tools.py: -------------------------------------------------------------------------------- 1 | from logger import get_logger 2 | logger = get_logger("debug") 3 | 4 | 5 | def benchmark(func, *params): 6 | import datetime 7 | import time 8 | start_time = time.time() 9 | return_value = func(*params) if params else func() 10 | total_time = datetime.timedelta(seconds=time.time() - start_time) 11 | logger.debug("Function {} - execution time : 
{}".format(func.__name__, total_time)) 12 | return return_value 13 | -------------------------------------------------------------------------------- /statistics/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/statistics/__init.py__.py -------------------------------------------------------------------------------- /statistics/datasets_statistics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/') 4 | import config as c 5 | 6 | 7 | def get_size_folder(start_path = '.'): 8 | total_size = 0 9 | for dirpath, dirnames, filenames in os.walk(start_path): 10 | for f in filenames: 11 | fp = os.path.join(dirpath, f) 12 | total_size += os.path.getsize(fp) 13 | return total_size 14 | 15 | # https://stackoverflow.com/a/1094933 16 | def sizeof_fmt(num, suffix='B'): 17 | for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: 18 | if abs(num) < 1024.0: 19 | return "%3.1f%s%s" % (num, unit, suffix) 20 | num /= 1024.0 21 | return "%.1f%s%s" % (num, 'Yi', suffix) 22 | 23 | def compute_malware_normal_packets(path_conn_label): 24 | normals = 0 25 | malwares = 0 26 | with open(path_conn_label) as f: 27 | for line in f: 28 | if line[0] == '#': 29 | continue 30 | split_conn_line = line.split('\t') 31 | 32 | if len(split_conn_line) < 22: 33 | continue 34 | 35 | label = split_conn_line[21] 36 | 37 | if 'From-Normal' in label: 38 | normals += 1 39 | elif 'From-Botnet' in label: 40 | malwares += 1 41 | 42 | return (normals, malwares) 43 | 44 | 45 | size_normal_dataset = 0 46 | size_ctu13_malware_dataset = 0 47 | size_other_malware_dataset = 0 48 | 49 | normal_dataset_normal_packets = 0 50 | ctu13_malware_dataset_normal_packets = 0 51 | other_malware_dataset_normal_packets = 0 52 | 53 | 
normal_dataset_malware_packets = 0 54 | ctu13_malware_dataset_malware_packets = 0 55 | other_malware_dataset_malware_packets = 0 56 | 57 | 58 | index = 0 59 | for sub_set in os.listdir(c.datasets_folder): 60 | if sub_set.startswith(".") or not os.path.exists(c.datasets_folder + sub_set + '/bro/ssl.log'): 61 | continue 62 | print "--------------------------------------------------------" 63 | print "-------- #" + str(index) + " " + sub_set 64 | print "--------------------------------------------------------" 65 | 66 | 67 | dataset_bro_folder = c.datasets_folder + sub_set + '/bro/' 68 | dataset_size = get_size_folder(dataset_bro_folder) 69 | print "Size of dataset : " + str(sizeof_fmt(dataset_size)) 70 | index += 1 71 | 72 | normals, malwares = compute_malware_normal_packets(dataset_bro_folder + 'conn_label.log') 73 | 74 | if sub_set.startswith("CTU-Normal-"): 75 | size_normal_dataset += dataset_size 76 | normal_dataset_normal_packets += normals 77 | normal_dataset_malware_packets += malwares 78 | elif sub_set.startswith("CTU-Malware-Capture-Botnet-") and 42 <= int(sub_set.split('-')[4]) <= 54: 79 | size_ctu13_malware_dataset += dataset_size 80 | ctu13_malware_dataset_normal_packets += normals 81 | ctu13_malware_dataset_malware_packets += malwares 82 | elif sub_set.startswith("CTU-Malware-Capture-Botnet-"): 83 | size_other_malware_dataset += dataset_size 84 | other_malware_dataset_normal_packets += normals 85 | other_malware_dataset_malware_packets +=malwares 86 | 87 | 88 | print "Normal packets: " + str(normals) 89 | print "Malware packets : " + str(malwares) 90 | 91 | 92 | 93 | print "\n\n============================" 94 | print "Size normal datasets : " + str(sizeof_fmt(size_normal_dataset)) 95 | print "\t>>> Normal packets : " + str(normal_dataset_normal_packets) 96 | print "\t>>> Malware packets : " + str(normal_dataset_malware_packets) 97 | print "Size CTU-13 malware datasets : " + str(sizeof_fmt(size_ctu13_malware_dataset)) 98 | print "\t>>> Normal packets : " 
+ str(ctu13_malware_dataset_normal_packets) 99 | print "\t>>> Malware packets : " + str(ctu13_malware_dataset_malware_packets) 100 | print "Size other malware datasets : " + str(sizeof_fmt(size_other_malware_dataset)) 101 | print "\t>>> Normal packets : " + str(other_malware_dataset_normal_packets) 102 | print "\t>>> Malware packets : " + str(other_malware_dataset_malware_packets) 103 | print "\n------------------" 104 | print " TOTAL Datasets" 105 | print "------------------" 106 | print "Total Size : " + str(sizeof_fmt(size_normal_dataset + size_ctu13_malware_dataset + size_other_malware_dataset)) 107 | print "Total normal packets : " + str(normal_dataset_normal_packets + ctu13_malware_dataset_normal_packets + other_malware_dataset_normal_packets) 108 | print "Total malwares packets : " + str(normal_dataset_malware_packets + ctu13_malware_dataset_malware_packets + other_malware_dataset_malware_packets) 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /statistics/dns_features_statistics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import config as c 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib 7 | from collections import Counter 8 | matplotlib.use('TkAgg') 9 | import matplotlib.pyplot as plt 10 | 11 | df = pd.read_csv(c.model_folder + 'features.csv') 12 | 13 | 14 | names = ["in_alexa_top100","in_alexa_top1k","in_alexa_top10k","in_alexa_top100k","in_alexa_top1m","not_in_alexa", 15 | "FQDN_length","domain_name_length","number_of_numerical_chars","number_of_non_alphanumeric_chars", 16 | "number_unique_IP_addresses_in_response","number_of_subdomains","average_ttls","min_ttls", 17 | "max_ttls","number_of_hyphens_in_fqdn","length_of_longest_subdomain_name","number_of_voyels_in_fqdn", 18 | "number_of_different_chars_in_fqdn","number_of_consonants_in_fqdn", 19 | "shannon_entropy_2ld","shannon_entropy_3ld","label"] 20 | 21 | 
#print(type(normal.values)) 22 | #print v.ndim 23 | 24 | #df.plot(kind='bar', stacked=True); 25 | 26 | """ 27 | shannon_entropy_2ld_botnet = list() 28 | shannon_entropy_2ld_normal = list() 29 | 30 | with open(c.model_folder + 'features.csv') as csvfile: 31 | reader = csv.DictReader(csvfile) 32 | for row in reader: 33 | print row['key'], row['number_of_flows'] 34 | if row['label'] == 'MALWARE': 35 | shannon_entropy_2ld_botnet.append(float(row['shannon_entropy_2ld'])) 36 | else: 37 | shannon_entropy_2ld_normal.append(float(row['shannon_entropy_2ld'])) 38 | 39 | """ 40 | 41 | import matplotlib 42 | matplotlib.use('TkAgg') 43 | import matplotlib.pyplot as plt 44 | 45 | 46 | def compute_stat_continue(feature_name, data): 47 | labels = ["Normal", "Botnet"] 48 | fig, ax = plt.subplots() 49 | ax.set_title('Feature ' + feature_name) 50 | ax.boxplot(data, labels=labels) 51 | 52 | fig.savefig(c.graphs_folder + feature_name + '.png') 53 | 54 | 55 | features_continue = ["FQDN_length","domain_name_length","number_of_numerical_chars","number_of_non_alphanumeric_chars", 56 | "number_unique_IP_addresses_in_response","number_of_subdomains","average_ttls","min_ttls", 57 | "max_ttls","number_of_hyphens_in_fqdn","length_of_longest_subdomain_name","number_of_voyels_in_fqdn", 58 | "number_of_different_chars_in_fqdn","number_of_consonants_in_fqdn", 59 | "shannon_entropy_2ld","shannon_entropy_3ld"] 60 | 61 | 62 | def plot_all_stat_continue(): 63 | for feature_name in features_continue: 64 | normal = df.loc[df['label'] == 'NORMAL'][feature_name] 65 | malware = df.loc[df['label'] == 'MALWARE'][feature_name] 66 | data = [normal, malware] 67 | compute_stat_continue(feature_name, data) 68 | 69 | def plot_alexa(): 70 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 71 | 72 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 73 | "not_in_alexa"] 74 | features_names_siplified = ["top 100", "top 1k", "top 10k", 
"top 100k", " top 1m", "not"] 75 | 76 | normal_means = list() 77 | normal_std = list() 78 | 79 | malware_means = list() 80 | malware_std = list() 81 | for i in range(len(features_alexa)): 82 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 83 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 84 | normal_means.append(np.mean(normal)) 85 | normal_std.append(np.std(normal)) 86 | malware_means.append(np.mean(malware)) 87 | malware_std.append(np.std(malware)) 88 | 89 | 90 | 91 | N = len(features_alexa) 92 | men_means = (20, 35, 30, 35, 27) 93 | men_std = (2, 3, 4, 1, 2) 94 | 95 | ind = np.arange(N) # the x locations for the groups 96 | width = 0.35 # the width of the bars 97 | 98 | fig, ax = plt.subplots() 99 | rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std) 100 | 101 | women_means = (25, 32, 34, 20, 25) 102 | women_std = (3, 5, 2, 3, 3) 103 | rects2 = ax.bar(ind + width, malware_means, width, color='r', yerr=malware_std) 104 | 105 | # add some text for labels, title and axes ticks 106 | ax.set_ylabel('Scores') 107 | ax.set_title('Scores by group and gender') 108 | ax.set_xticks(ind + width / 2) 109 | ax.set_xticklabels(features_names_siplified) 110 | 111 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 112 | fig.savefig(c.graphs_folder + "features_alexa" + '.png') 113 | 114 | def plot_alexa2(): 115 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 116 | 117 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 118 | "not_in_alexa"] 119 | features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k", " top 1m", "not"] 120 | 121 | normal_percentage = list() 122 | malware_percentage = list() 123 | for i in range(len(features_alexa)): 124 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 125 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 126 | normal_percentage.append(sum(normal) / 
float(len(df))) 127 | malware_percentage.append(sum(malware) / float(len(df))) 128 | 129 | N = len(features_alexa) 130 | 131 | ind = np.arange(N) # the x locations for the groups 132 | width = 0.35 # the width of the bars 133 | 134 | fig, ax = plt.subplots() 135 | rects1 = ax.bar(ind, normal_percentage, width, color='g') 136 | 137 | rects2 = ax.bar(ind + width, malware_percentage, width, color='r') 138 | 139 | # add some text for labels, title and axes ticks 140 | ax.set_ylabel('Percentage of flows') 141 | ax.set_title('Flows in Top alexa') 142 | ax.set_xticks(ind + width / 2) 143 | ax.set_xticklabels(features_names_simplified) 144 | 145 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 146 | fig.savefig(c.graphs_folder + "features_alexa2" + '.png') 147 | 148 | 149 | def plot_barchar(features_name): 150 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 151 | 152 | normal = df.loc[df['label'] == 'NORMAL'][features_name] 153 | malware = df.loc[df['label'] == 'MALWARE'][features_name] 154 | c = Counter(normal) 155 | print c.most_common(15) 156 | return 157 | 158 | 159 | 160 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 161 | "not_in_alexa"] 162 | features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k", " top 1m", "not"] 163 | 164 | normal_means = list() 165 | normal_std = list() 166 | 167 | malware_means = list() 168 | malware_std = list() 169 | for i in range(len(features_alexa)): 170 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 171 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 172 | normal_means.append(np.mean(normal)) 173 | normal_std.append(np.std(normal)) 174 | malware_means.append(np.mean(malware)) 175 | malware_std.append(np.std(malware)) 176 | 177 | 178 | 179 | N = len(features_alexa) 180 | men_means = (20, 35, 30, 35, 27) 181 | men_std = (2, 3, 4, 1, 2) 182 | 183 | ind = np.arange(N) # the x locations for the 
groups 184 | width = 0.35 # the width of the bars 185 | 186 | fig, ax = plt.subplots() 187 | rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std) 188 | 189 | women_means = (25, 32, 34, 20, 25) 190 | women_std = (3, 5, 2, 3, 3) 191 | rects2 = ax.bar(ind + width, malware_means, width, color='r', yerr=malware_std) 192 | 193 | # add some text for labels, title and axes ticks 194 | ax.set_ylabel('Scores') 195 | ax.set_title('Scores by group and gender') 196 | ax.set_xticks(ind + width / 2) 197 | ax.set_xticklabels(features_names_simplified) 198 | 199 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 200 | fig.savefig(c.graphs_folder + "features_alexa" + '.png') 201 | 202 | plot_all_stat_continue() 203 | plot_alexa2() 204 | plot_barchar("number_unique_IP_addresses_in_response") 205 | 206 | nb_flows_normal = len(df.loc[df['label'] == 'NORMAL']) 207 | nb_flows_malware = len(df.loc[df['label'] == 'MALWARE']) 208 | print "Number of flows Normal : " + str(nb_flows_normal) 209 | print "Number of flows Malware : " + str(nb_flows_malware) 210 | print "Total : " + str(nb_flows_normal + nb_flows_malware) 211 | -------------------------------------------------------------------------------- /tools/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/tools/__init.py__.py -------------------------------------------------------------------------------- /tools/backup_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/') 4 | import shutil 5 | import config as c 6 | import time 7 | from logger import get_logger 8 | 9 | logger = get_logger('debug') 10 | 11 | 12 | def copytree(src, dst, symlinks=False, ignore=None): 13 | for item in os.listdir(src): 14 | s = os.path.join(src, item) 15 
def is_ipv4(address):
    """Return True when *address* is a valid dotted-quad IPv4 string.

    Accepts exactly four '.'-separated fields, each parsing as an integer
    in the range 0..255.  (The parameter was renamed: the original
    shadowed the builtin ``str``.)
    """
    parts = address.split('.')
    if len(parts) != 4:
        return False
    try:
        octets = [int(part) for part in parts]
    except ValueError:
        return False
    # all() short-circuits and, unlike len(filter(...)), also works on Python 3.
    return all(0 <= octet <= 255 for octet in octets)
def entropy(text):
    """Calculate the Shannon entropy of a string, in bits per character.

    Returns 0.0 for the empty string (the original raised
    ZeroDivisionError on ``len(text) == 0``).
    """
    if not text:
        return 0.0
    length = float(len(text))
    # Probability of each *distinct* character; set() avoids counting twice.
    probabilities = [text.count(ch) / length for ch in set(text)]
    # math.log(p, 2) replaces the hand-rolled log(p)/log(2.0) base change.
    return -sum(p * math.log(p, 2) for p in probabilities)
"/bro_ciphers/" 14 | os.mkdir(datasets_folder + dataset_name + "/bro_ciphers/") 15 | 16 | subprocess.Popen(["bro", "-C", "-r", "../"+filename, "-b", "base/protocols/ssl", "site/tls_finger"], cwd=working_dir).wait() -------------------------------------------------------------------------------- /tools/generate_features_table.py: -------------------------------------------------------------------------------- 1 | import config as c 2 | import csv 3 | from collections import OrderedDict 4 | 5 | 6 | def get_features_name(): 7 | with open(c.model_folder + "features.csv", 'r') as csvfile: 8 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 9 | features_name = csvreader.next()[1:-1] 10 | return features_name 11 | 12 | if __name__ == '__main__': 13 | 14 | features_name = get_features_name() 15 | 16 | #latex_table = "\\begin{table}[!h]\n" \ 17 | # "\centering\n" \ 18 | # "\\begin{adjustbox}{max width=\\textwidth}\n" \ 19 | # "\\begin{tabular}{llll}\n" 20 | headers = "\\textbf{{{}}} & \\textbf{{{}}} & \\textbf{{{}}} & \\textbf{{{}}} \\\\ \n\hline \n".format("", "ID", "Feature name", "proposed in") 21 | # latex_table += headers 22 | 23 | # Long table 24 | latex_table = "\\begin{longtable}{llll}\n" 25 | latex_table += headers 26 | latex_table += "\endhead\n" \ 27 | "\endfoot\n" \ 28 | 29 | dns_features_name = features_name[41:] 30 | ordered_dns = OrderedDict().fromkeys(dns_features_name) 31 | dns_features_references = { 32 | 'number_of_different_chars_in_fqdn': "marques2017thesis", 33 | 'number_of_hyphens_in_fqdn': "wang2015breakingbad", 34 | 'shannon_entropy_3ld': "marques2017thesis", 35 | 'number_of_voyels_in_fqdn': "aashna2017dga", 36 | 'in_alexa_top100': "anderson2016identifying", 37 | 'number_of_subdomains': "hao2017exploring", 38 | 'not_in_alexa': "anderson2016identifying", 39 | 'min_ttls': "marques2017thesis", 40 | 'shannon_entropy_2ld': "marques2017thesis", 41 | 'length_of_longest_subdomain_name': "hao2017exploring", 
def generate_table(results, headers, caption, label):
    """Render tab-separated *results* as a LaTeX table wrapped in an adjustbox.

    *results* is a newline-separated list of rows; within a row, the first
    tab-separated field is the model name and the remaining fields are
    numbers, rounded to 3 decimal places in the output.  *headers*,
    *caption* and *label* are inserted verbatim into the LaTeX skeleton.
    """
    latex_table = ("\\begin{table}[!h]\n"
                   "\\centering\n"
                   "\\begin{adjustbox}{max width=\\textwidth}\n"
                   "\\begin{tabular}{lllllllllll}\n")

    latex_table += " & ".join("\\textbf{{{}}}".format(h) for h in headers)
    latex_table += "\\\\ \\hline \n"

    for raw_line in results.split("\n"):
        # str.strip() per cell: map(string.strip, ...) was Python-2-only
        # (string.strip is gone and map() is not subscriptable on Python 3).
        cells = [cell.strip() for cell in raw_line.split("\t")]
        latex_table += cells[0]
        for value in cells[1:]:
            latex_table += " & " + str(round(float(value), 3))
        latex_table += " \\\\ \n"

    latex_table += ("\\end{tabular}\n"
                    "\\end{adjustbox}\n")
    latex_table += "\\caption{{{}}}\n".format(caption)
    latex_table += "\\label{{{}}}\n".format(label)
    latex_table += "\\end{table}\n"

    return latex_table
"&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c||}{FPR}&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c||}{FPR}&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c|}{FPR}\\\\ \\midrule \n" 56 | 57 | for model in models_order: 58 | latex_table += "{} & {:.3f} & {:.3f} & {:.3f} & {:.3f} & {:.3f} & {:.3f} \\\\ \n".format(model, https[model][4], https[model][3], https_dns[model][4], https_dns[model][3], enhanced_30[model][4], enhanced_30[model][3]) 59 | 60 | latex_table += "\\midrule\end{tabular}\n" \ 61 | "\end{center}" 62 | latex_table += "\caption{{{}}}\n".format(caption) 63 | latex_table += "\label{{{}}}\n".format(label) 64 | latex_table += "\end{table}\n" 65 | 66 | return latex_table 67 | 68 | if __name__ == '__main__': 69 | 70 | headers = ['Model', 'Best score', 'TPR', 'TNR', 'FPR', 'Acc', 'Err', 'Pre', 'F-M', 'MCC', 'AUC'] 71 | 72 | # https 73 | results_https = "XGBoost 0.9853618866901599 0.984 0.987 0.013 0.985 0.015 0.987 0.985 0.971 0.985\n \ 74 | Random forest 0.97289238276 0.969 0.977 0.023 0.973 0.027 0.977 0.973 0.946 0.973\n \ 75 | Decision tree 0.955543507726 0.95 0.961 0.039 0.956 0.044 0.961 0.955 0.911 0.956\n \ 76 | AdaBoost 0.951206288967 0.952 0.95 0.05 0.951 0.049 0.95 0.951 0.902 0.951\n \ 77 | k-NN 0.880726484142 0.871 0.89 0.11 0.881 0.119 0.888 0.88 0.762 0.881\n \ 78 | Neural net 0.8359989156953104 0.862 0.81 0.19 0.836 0.164 0.819 0.84 0.673 0.836\n \ 79 | Log. 
Regression 0.817565735972 0.796 0.839 0.161 0.818 0.182 0.832 0.813 0.636 0.818\n \ 80 | NB - Gaussian 0.5917592843589049 0.234 0.949 0.051 0.592 0.408 0.821 0.364 0.262 0.591" 81 | 82 | # https + dns 83 | results_https_dns = "XGBoost 0.9886148007590133 0.985 0.992 0.008 0.989 0.011 0.992 0.989 0.977 0.989\n \ 84 | AdaBoost\t0.970452697208 0.971 0.97 0.03 0.97 0.03 0.97 0.97 0.941 0.97\n \ 85 | Random forest 0.969910544863 0.958 0.982 0.018 0.97 0.03 0.982 0.97 0.94 0.97\n \ 86 | Decision tree 0.956627812415 0.946 0.967 0.033 0.957 0.043 0.967 0.956 0.913 0.957\n \ 87 | k-NN 0.905394415831 0.893 0.918 0.082 0.905 0.095 0.916 0.904 0.811 0.905\n \ 88 | Neural net 0.9024125779343996 0.939 0.866 0.134 0.902 0.098 0.875 0.906 0.807 0.902\n \ 89 | Log. Regression\t0.877744646246 0.863 0.893 0.107 0.878 0.122 0.889 0.876 0.756 0.878\n \ 90 | NB - Gaussian 0.750338845216 0.555 0.945 0.055 0.75 0.25 0.91 0.69 0.544 0.75\n" 91 | 92 | # enhanced_30 feature set 93 | results_enhanced_30 = "XGBoost 0.999 0.995 0.999 0.001 0.997 0.003 0.999 0.997 0.994 0.997\n \ 94 | Random forest 0.999453850355 0.993 0.999 0.001 0.996 0.004 0.999 0.996 0.992 0.996\n \ 95 | Decision tree 0.999437570304 0.964 0.999 0.001 0.982 0.018 0.999 0.981 0.964 0.982\n \ 96 | k-NN 0.998904709748 0.99 0.999 0.001 0.994 0.006 0.999 0.994 0.989 0.994\n \ 97 | AdaBoost 0.997018162104 0.995 0.999 0.001 0.997 0.003 0.999 0.997 0.994 0.997\n \ 98 | Neural net 0.996476009759 0.993 0.999 0.001 0.996 0.004 0.999 0.996 0.993 0.996\n \ 99 | Log. Regression 0.995662781242 0.992 0.999 0.001 0.996 0.004 0.999 0.996 0.991 0.996\n \ 100 | NB - Gaussian 0.994578476552 0.993 0.996 0.004 0.995 0.005 0.996 0.995 0.989 0.995\n \ 101 | " 102 | results_enhanced_30 = "XGBoost 0.9994547437295529 0.994574064026 0.999458288191 0.000541711809317 0.997018162104 0.00298183789645 0.99945474373 0.997008430786 0.994048130058 0.9970161761083636\n \ 103 | Log. 
def binarySearch(alist, item):
    """Return True if ``item`` occurs in the sorted list ``alist``.

    Classic iterative binary search, O(log n).  ``alist`` must be sorted
    under the same ordering that ``<`` / ``==`` apply to its elements.
    An empty list is handled (returns False immediately).

    NOTE(review): the caller below sorts with ``key=str.lower`` while this
    search compares case-sensitively, so mixed-case entries may be
    missed — confirm the intended ordering.

    :param alist: sorted sequence to search
    :param item: value to look for
    :return: True if found, False otherwise
    """
    first = 0
    last = len(alist) - 1
    while first <= last:
        midpoint = (first + last) // 2
        if alist[midpoint] == item:
            # Early return replaces the original's `found` flag and the
            # dead `pos` variable (assigned but never used or returned).
            return True
        if item < alist[midpoint]:
            last = midpoint - 1
        else:
            first = midpoint + 1
    return False
def test():
    """CPU-bound busy work used to exercise benchmark().

    Accumulates the integers 0..9999 and returns their total.
    (Deliberately avoids the builtin ``sum`` — this module shadows it
    with a two-argument function below.)
    """
    total = 0
    value = 0
    while value < 10000:
        total += value
        value += 1
    return total