├── .gitignore ├── LICENSE.md ├── README.md ├── __init__.py ├── dataset_tools ├── __init.py__.py ├── collect_infected_ips.py ├── discard_unuseful_datasets.py ├── download_datasets.py ├── infected_ips.json ├── label_mcfp_datasets.py ├── label_normal_datasets.py └── normal_ips.json ├── example_config.py ├── features_extraction ├── CertificateFeatures.py ├── ComputeFeatures.py ├── Connection4tuple.py ├── ConnectionFeatures.py ├── DNSConnection.py ├── DNSFeatures.py ├── DatasetInformation.py ├── ExtractFeatures.py ├── MainBro.py ├── __init.py__.py └── top_level_domain ├── logger.py ├── machine_learning ├── Get_normalize_data.py ├── __init.py__.py ├── features_selection.py ├── model.py ├── normalize_and_split.py └── train.py ├── main_tools.py ├── statistics ├── __init.py__.py ├── datasets_statistics.py ├── dns_features_statistics.py ├── dns_features_stats.ipynb └── models_stats.ipynb └── tools ├── __init.py__.py ├── backup_results.py ├── check_IP.py ├── delete_results.py ├── download_datasets_gdrive.sh ├── entropy.py ├── extract_bro_ciphers.py ├── generate_features_table.py ├── generate_results_table.py ├── split_alexa.py ├── timeFunction.py └── tls_finger.bro /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # PyCharm 109 | .idea 110 | 111 | # Config file 112 | config.py 113 | .DS_Store 114 | /results/graphs/ 115 | /results/logs/ 116 | /results/model/ 117 | /results/features/ 118 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright for portions of project BotnetDetectionThesis are held by František Střasák 2018 as part of project HTTPSDetector (https://github.com/frenky-strasak/HTTPSDetector). All other copyright for project BotnetDetectionThesis are held by lminy 2018. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BotnetDetectorThesis 2 | 3 | This implementation was realized for my master thesis on "Botnet detection in encrypted traffic - a machine learning approach" 4 | 5 | ## Configuration 6 | The configuration has to be done in config.py file. A template is provided in example_config.py 7 | 8 | 9 | ## Run 10 | 11 | Follow these steps: 12 | 1. run features_extraction/MainBro.py to extract the features in results/features.csv 13 | 2. run machine_learning/normalize_and_split.py to generate data to feed to ML 14 | 3. 
run train.py to generate models
15 | 
16 | ## Choosing the set of features to train
17 | 
18 | Pass the setname of the features to use through
19 | ```Python
20 | Get_normalize_data.get_all_data("model_folder", "set_name")
21 | ```
22 | setname can take the value "all", "dns", "https", "reduced", "reduced_30", "reduced_40" and "enhanced_30".
23 | To create a new set of features, just complete the *features_set* dictionary present in the *get_all_data(...)* function
24 | 
25 | ## Generate the enhanced features set
26 | The enhanced features set contains cipher suites from ClientHello packets.
27 | Unfortunately the information is not available by default in Bro logs.
28 | Therefore it is required to extract them by hand. The tls_finger.bro script from [securityartwork.es](https://www.securityartwork.es/2017/02/02/tls-client-fingerprinting-with-bro/) has been used in order to do this extraction
29 | Moreover, to avoid re-computing the whole features set (which is time- and resource-consuming),
30 | the features are calculated separately then added to the csv with all features.
31 | 
32 | Here are the steps to generate the enhanced features set:
33 | 
34 | 1. Install [Bro](https://www.bro.org/download/index.html) or install [SecurityOnion](https://securityonion.net/) and put the **tls_finger.bro** file into the folder **"/usr/local/share/bro/site"**
35 | 2. Use **extract_bro_ciphers.py** to extract cipher suites from Bro logs
36 | 3. 
Use **feature_extraction/compute_ciphersuites_features.ipynb** to compute the features from Bro logs and store them in **results/model/features_enhanced.csv** 37 | 38 | 39 | ## Project structure 40 | - **dataset_tools/** -> contains all the tools related to the datasets (download, collect infected IPs, label and discard datasets) 41 | - **download_datasets.py**: to download the desired datasets 42 | - **discard_unuseful_datasets.py**: to discard datasets that have no flows labelled 43 | - **collect_infected_ips.py**: to collect infected and normal IPs from README.html files present in the dataset folders (uses a regex to parse the files) 44 | - **label_normal_datasets.py**: to label normal datasets 45 | - **label_mcfp_datasets.py**: to label MFCP datasets (excluding the "CTU-13 Dataset" which is already labelled) 46 | - **features_extraction/** -> contains the scripts that extract the features. Credits go to [Frantisek Strasak](https://github.com/frenky-strasak) for HTTPS features extractions. 
47 | - **machine_learning/** -> contains the scripts to normalize the data from the features extracted and train the model 48 | - **results/{graphs|logs|model}** -> default folders for generated graphs, models and logs 49 | - **results_backup/** -> contains the backup results of the different experiments 50 | - **statistics/** -> contains the scripts to analyze the features extracted and the models generated 51 | - **tools/** -> Various tools: 52 | - **tls_finger.bro**: Bro script to extract cipher suites 53 | - **extract_bro_ciphers.py**: Python script to extract logs + cipher suites from pcap's 54 | - **backup_results.py**: to backup the result folder (requires "results_folder_backup" to be set in config file) 55 | - **delete_results.py**: to delete the result folder 56 | - **split_alexa.py**: to sort and split alexa top websites in multiple files for quicker lookups 57 | 58 | ## Main requirements 59 | - [Python 2.7](https://www.python.org/download/releases/2.7/) 60 | - [Jupyter notebook](https://jupyter.org/install) 61 | - [Numpy](http://www.numpy.org/) 62 | - [SciPy](https://www.scipy.org/install.html) 63 | - [sklearn](http://scikit-learn.org/stable/install.html) 64 | - [XGBoost](https://github.com/dmlc/xgboost/tree/master/python-package) 65 | 66 | 67 | ## License 68 | BotnetDetectorThesis is released under the MIT license. Credits go to František Střasák for some parts of the code (https://github.com/frenky-strasak/HTTPSDetector). 
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/__init__.py -------------------------------------------------------------------------------- /dataset_tools/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/dataset_tools/__init.py__.py -------------------------------------------------------------------------------- /dataset_tools/collect_infected_ips.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | import re 5 | import config as c 6 | 7 | 8 | def run(cmd): 9 | import subprocess 10 | return subprocess.check_output(cmd) 11 | 12 | infected_ips = dict() 13 | normal_ips = dict() 14 | 15 | 16 | def print_ips(dataset_name=None): 17 | if dataset_name is None: 18 | print "Infected IPs : " + json.dumps(infected_ips) 19 | print "Normal IPs : " + json.dumps(normal_ips) 20 | else: 21 | print "Infected IPs : " + json.dumps(infected_ips[dataset_name]) 22 | print "Normal IPs : " + json.dumps(normal_ips[dataset_name]) 23 | 24 | 25 | # Loads json files 26 | if os.path.exists("./infected_ips.json") and os.path.exists("./normal_ips.json"): 27 | with open('./infected_ips.json', 'r') as f: 28 | infected_ips = json.load(f) 29 | 30 | with open('./normal_ips.json', 'r') as f: 31 | normal_ips = json.load(f) 32 | 33 | print_ips() 34 | 35 | 36 | infected_ips_collected_by_hand = { 37 | "CTU-Malware-Capture-Botnet-25-1":["10.0.2.106"], 38 | "CTU-Malware-Capture-Botnet-25-2":["10.0.2.103"], 39 | "CTU-Malware-Capture-Botnet-25-3":["10.0.2.103"], 40 | "CTU-Malware-Capture-Botnet-25-4":["10.0.2.103"], 41 | 
"CTU-Malware-Capture-Botnet-25-5":["10.0.2.103"], 42 | "CTU-Malware-Capture-Botnet-25-6":["10.0.2.103"], 43 | "CTU-Malware-Capture-Botnet-31-1":["10.0.2.110"], 44 | "CTU-Malware-Capture-Botnet-69":["10.0.2.117"], 45 | "CTU-Malware-Capture-Botnet-78-2":["10.0.2.108"], 46 | "CTU-Malware-Capture-Botnet-78-1":["10.0.2.108"], 47 | "CTU-Malware-Capture-Botnet-83-1":["10.0.2.102"], 48 | "CTU-Malware-Capture-Botnet-83-2":["10.0.2.102"], 49 | "CTU-Malware-Capture-Botnet-90":["192.168.3.104"], 50 | "CTU-Malware-Capture-Botnet-261-4":['192.168.1.'+str(i) for i in range(0,256)], 51 | "CTU-Malware-Capture-Botnet-301-1":['192.168.1.'+str(i) for i in range(0,256)], 52 | "CTU-Malware-Capture-Botnet-321-1":['192.168.1.'+str(i) for i in range(0,256)], 53 | 54 | } 55 | 56 | infected_ips.update(infected_ips_collected_by_hand) 57 | 58 | with open('./infected_ips.json', 'w') as f: 59 | f.write(json.dumps(infected_ips)) 60 | 61 | 62 | normal_ips_collected_by_hand = { 63 | "CTU-Malware-Capture-Botnet-25-1":[""], 64 | "CTU-Malware-Capture-Botnet-25-2":[""], 65 | "CTU-Malware-Capture-Botnet-25-3":[""], 66 | "CTU-Malware-Capture-Botnet-25-4":[""], 67 | "CTU-Malware-Capture-Botnet-25-5":[""], 68 | "CTU-Malware-Capture-Botnet-25-6":[""], 69 | "CTU-Malware-Capture-Botnet-31-1":[""], 70 | "CTU-Malware-Capture-Botnet-69":[""], 71 | "CTU-Malware-Capture-Botnet-78-1":[""], 72 | "CTU-Malware-Capture-Botnet-78-2":[""], 73 | "CTU-Malware-Capture-Botnet-83-1":[""], 74 | "CTU-Malware-Capture-Botnet-83-2":[""], 75 | "CTU-Malware-Capture-Botnet-90":[""], 76 | "CTU-Malware-Capture-Botnet-261-4":[""], 77 | "CTU-Malware-Capture-Botnet-301-1":[""], 78 | "CTU-Malware-Capture-Botnet-321-1":[""], 79 | } 80 | 81 | normal_ips.update(normal_ips_collected_by_hand) 82 | 83 | with open('./normal_ips.json', 'w') as f: 84 | f.write(json.dumps(normal_ips)) 85 | 86 | 87 | index = 0 88 | for sub_set in os.listdir(c.datasets_folder_general): 89 | if sub_set.startswith(".") or not os.path.exists(c.datasets_folder_general + 
sub_set + '/bro/ssl.log'): 90 | continue 91 | 92 | dataset_folder = c.datasets_folder_general + sub_set 93 | 94 | index += 1 95 | 96 | dataset_number = int(sub_set.split('-')[4]) 97 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") and (dataset_number <= 42 or dataset_number >= 54): 98 | print "========================================================" 99 | print "======== #" + str(index) + " " + sub_set 100 | print "========================================================" 101 | if sub_set in infected_ips: 102 | print "Already checked! :)" 103 | print_ips(sub_set) 104 | continue 105 | 106 | #print os.listdir(dataset_folder) 107 | for filename in os.listdir(dataset_folder): 108 | if "README.html" in filename: 109 | ips = list() 110 | 111 | with open(dataset_folder + "/" + filename) as f: 112 | for line in f: 113 | matchObj = re.match('.*Infected host: (\d+\.\d+\.\d+\.\d+).*', line) 114 | 115 | if matchObj: 116 | ips.append(matchObj.group(1)) 117 | 118 | if len(ips) > 0: 119 | print "IPs Found : " + str(ips) 120 | infected_ips[sub_set] = ips 121 | with open('./infected_ips.json', 'w') as f: 122 | f.write(json.dumps(infected_ips)) 123 | normal_ips[sub_set] = [""] 124 | with open('./normal_ips.json', 'w') as f: 125 | f.write(json.dumps(normal_ips)) 126 | else: 127 | print "No match!!" 
128 | print "------------------------------------" 129 | print "---------- Infected hosts" 130 | #print run(["grep", "-i", "-C", "3", "Infected", dataset_folder + "/" + filename]) 131 | ips = str(raw_input()) 132 | infected_ips[sub_set] = ips.split(",") 133 | with open('./infected_ips.json', 'w') as f: 134 | f.write(json.dumps(infected_ips)) 135 | 136 | print "------------------------------------" 137 | print "---------- Normal hosts" 138 | #print run(["grep", "-i", "-C", "3", 'Normal', dataset_folder + "/" + filename]) 139 | ips = str(raw_input()) 140 | normal_ips[sub_set] = ips.split(",") 141 | with open('./normal_ips.json', 'w') as f: 142 | f.write(json.dumps(normal_ips)) 143 | break 144 | 145 | 146 | -------------------------------------------------------------------------------- /dataset_tools/discard_unuseful_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import json 4 | import config as c 5 | 6 | with open('./infected_ips.json', 'r') as f: 7 | infected_ips = json.load(f) 8 | 9 | with open('./normal_ips.json', 'r') as f: 10 | normal_ips = json.load(f) 11 | 12 | index = 0 13 | for sub_set in os.listdir(c.datasets_folder_general): 14 | if sub_set.startswith(".") or not os.path.exists(datasets_folder + sub_set + '/bro/ssl.log'): 15 | continue 16 | 17 | dataset_folder = c.datasets_folder_general + sub_set 18 | 19 | index += 1 20 | 21 | dataset_number = int(sub_set.split('-')[4]) 22 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") and (dataset_number <= 42 or dataset_number >= 54): 23 | print("========================================================") 24 | print("======== #" + str(index) + " " + sub_set) 25 | print("========================================================") 26 | if len(infected_ips[sub_set][0]) == 0 and \ 27 | len(normal_ips[sub_set][0]) == 0: 28 | print("Moving dataset {} ({}) to {}".format(sub_set, dataset_folder, folder_other_datasets)) 29 | 
shutil.move(dataset_folder, c.datasets_discarded_folder) -------------------------------------------------------------------------------- /dataset_tools/download_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download all datasets which have bro folder. 3 | USAGE: 4 | python download_datasets.py https://mcfp.felk.cvut.cz/publicDatasets/ 5 | """ 6 | 7 | import sys 8 | from bs4 import BeautifulSoup 9 | import requests 10 | import requests.packages.urllib3.exceptions 11 | requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) 12 | import urllib2 13 | import ssl 14 | import os 15 | import shutil 16 | import config as c 17 | import time 18 | import datetime 19 | from logger import get_logger 20 | 21 | 22 | import logging 23 | #import config as c 24 | 25 | 26 | logger = get_logger('debug') 27 | 28 | 29 | files_to_download = ["ssl.log", "x509.log", "weird.log", "conn.log", "dns.log"] 30 | 31 | # Normal datasets 32 | datasets_to_download = [ 33 | 'CTU-Normal-20/', 34 | 'CTU-Normal-21/', 35 | 'CTU-Normal-22/', 36 | 'CTU-Normal-23/', 37 | 'CTU-Normal-24/', 38 | 'CTU-Normal-25/', 39 | 'CTU-Normal-26/', 40 | 'CTU-Normal-27/', 41 | 'CTU-Normal-28/', 42 | 'CTU-Normal-29/', 43 | 'CTU-Normal-30/', 44 | 'CTU-Normal-31/', 45 | 'CTU-Normal-32/' 46 | ] 47 | 48 | # THE CTU-13 DATASET 49 | datasets_to_download = [ 50 | 'CTU-Malware-Capture-Botnet-42/', 51 | 'CTU-Malware-Capture-Botnet-43/', 52 | 'CTU-Malware-Capture-Botnet-44/', 53 | 'CTU-Malware-Capture-Botnet-45/', 54 | 'CTU-Malware-Capture-Botnet-46/', 55 | 'CTU-Malware-Capture-Botnet-47/', 56 | 'CTU-Malware-Capture-Botnet-48/', 57 | 'CTU-Malware-Capture-Botnet-49/', 58 | 'CTU-Malware-Capture-Botnet-50/', 59 | 'CTU-Malware-Capture-Botnet-51/', 60 | 'CTU-Malware-Capture-Botnet-52/', 61 | 'CTU-Malware-Capture-Botnet-53/', 62 | 'CTU-Malware-Capture-Botnet-54/', 63 | ] 64 | 65 | # Whole datasets 66 | datasets_to_download = 
["CTU-Malware-Capture-Botnet-1", "CTU-Malware-Capture-Botnet-102", "CTU-Malware-Capture-Botnet-111-1", "CTU-Malware-Capture-Botnet-116-1", "CTU-Malware-Capture-Botnet-116-2", "CTU-Malware-Capture-Botnet-138-1", "CTU-Malware-Capture-Botnet-157-1", "CTU-Malware-Capture-Botnet-163-1", "CTU-Malware-Capture-Botnet-164-1", "CTU-Malware-Capture-Botnet-169-1", "CTU-Malware-Capture-Botnet-169-2", "CTU-Malware-Capture-Botnet-169-3", "CTU-Malware-Capture-Botnet-17-1", "CTU-Malware-Capture-Botnet-17-2", "CTU-Malware-Capture-Botnet-174-1", "CTU-Malware-Capture-Botnet-175-1", "CTU-Malware-Capture-Botnet-177-1", "CTU-Malware-Capture-Botnet-178-1", "CTU-Malware-Capture-Botnet-179-1", "CTU-Malware-Capture-Botnet-180-1", "CTU-Malware-Capture-Botnet-181-1", "CTU-Malware-Capture-Botnet-183-1", "CTU-Malware-Capture-Botnet-184-1", "CTU-Malware-Capture-Botnet-185-1", "CTU-Malware-Capture-Botnet-186-1", "CTU-Malware-Capture-Botnet-187-1", "CTU-Malware-Capture-Botnet-188-1", "CTU-Malware-Capture-Botnet-188-2", "CTU-Malware-Capture-Botnet-188-3", "CTU-Malware-Capture-Botnet-188-4", "CTU-Malware-Capture-Botnet-189-1", "CTU-Malware-Capture-Botnet-189-2", "CTU-Malware-Capture-Botnet-193-1", "CTU-Malware-Capture-Botnet-193-2", "CTU-Malware-Capture-Botnet-194-1", "CTU-Malware-Capture-Botnet-195-1", "CTU-Malware-Capture-Botnet-196-1", "CTU-Malware-Capture-Botnet-198-1", "CTU-Malware-Capture-Botnet-199-1", "CTU-Malware-Capture-Botnet-199-2", "CTU-Malware-Capture-Botnet-200-1", "CTU-Malware-Capture-Botnet-201-1", "CTU-Malware-Capture-Botnet-202-1", "CTU-Malware-Capture-Botnet-203-1", "CTU-Malware-Capture-Botnet-204-1", "CTU-Malware-Capture-Botnet-205-1", "CTU-Malware-Capture-Botnet-205-2", "CTU-Malware-Capture-Botnet-208-2", "CTU-Malware-Capture-Botnet-209-1", "CTU-Malware-Capture-Botnet-210-1", "CTU-Malware-Capture-Botnet-211-1", "CTU-Malware-Capture-Botnet-211-2", "CTU-Malware-Capture-Botnet-213-1", "CTU-Malware-Capture-Botnet-215-1", "CTU-Malware-Capture-Botnet-215-2", 
"CTU-Malware-Capture-Botnet-217-1", "CTU-Malware-Capture-Botnet-218-1", "CTU-Malware-Capture-Botnet-219-1", "CTU-Malware-Capture-Botnet-219-2", "CTU-Malware-Capture-Botnet-219-3", "CTU-Malware-Capture-Botnet-220-1", "CTU-Malware-Capture-Botnet-221-1", "CTU-Malware-Capture-Botnet-221-2", "CTU-Malware-Capture-Botnet-222-1", "CTU-Malware-Capture-Botnet-224-1", "CTU-Malware-Capture-Botnet-227-1", "CTU-Malware-Capture-Botnet-228-1", "CTU-Malware-Capture-Botnet-230-1", "CTU-Malware-Capture-Botnet-230-2", "CTU-Malware-Capture-Botnet-231-1", "CTU-Malware-Capture-Botnet-232-1", "CTU-Malware-Capture-Botnet-235-1", "CTU-Malware-Capture-Botnet-237-1", "CTU-Malware-Capture-Botnet-238-1", "CTU-Malware-Capture-Botnet-239-1", "CTU-Malware-Capture-Botnet-240-1", "CTU-Malware-Capture-Botnet-241-1", "CTU-Malware-Capture-Botnet-242-1", "CTU-Malware-Capture-Botnet-243-1", "CTU-Malware-Capture-Botnet-244-1", "CTU-Malware-Capture-Botnet-245-1", "CTU-Malware-Capture-Botnet-246-1", "CTU-Malware-Capture-Botnet-247-1", "CTU-Malware-Capture-Botnet-248-1", "CTU-Malware-Capture-Botnet-249-1", "CTU-Malware-Capture-Botnet-25-1", "CTU-Malware-Capture-Botnet-25-2", "CTU-Malware-Capture-Botnet-25-3", "CTU-Malware-Capture-Botnet-25-4", "CTU-Malware-Capture-Botnet-25-5", "CTU-Malware-Capture-Botnet-25-6", "CTU-Malware-Capture-Botnet-251-1", "CTU-Malware-Capture-Botnet-253-1", "CTU-Malware-Capture-Botnet-254-1", "CTU-Malware-Capture-Botnet-257-1", "CTU-Malware-Capture-Botnet-260-1", "CTU-Malware-Capture-Botnet-261-1", "CTU-Malware-Capture-Botnet-261-2", "CTU-Malware-Capture-Botnet-261-3", "CTU-Malware-Capture-Botnet-261-4", "CTU-Malware-Capture-Botnet-263-1", "CTU-Malware-Capture-Botnet-264-1", "CTU-Malware-Capture-Botnet-265-1", "CTU-Malware-Capture-Botnet-266-1", "CTU-Malware-Capture-Botnet-267-1", "CTU-Malware-Capture-Botnet-270-1", "CTU-Malware-Capture-Botnet-273-1", "CTU-Malware-Capture-Botnet-274-1", "CTU-Malware-Capture-Botnet-275-1", "CTU-Malware-Capture-Botnet-277-1", 
"CTU-Malware-Capture-Botnet-278-1", "CTU-Malware-Capture-Botnet-279-1", "CTU-Malware-Capture-Botnet-280-1", "CTU-Malware-Capture-Botnet-281-1", "CTU-Malware-Capture-Botnet-282-1", "CTU-Malware-Capture-Botnet-285-1", "CTU-Malware-Capture-Botnet-287-1", "CTU-Malware-Capture-Botnet-290-1", "CTU-Malware-Capture-Botnet-291-1", "CTU-Malware-Capture-Botnet-292-1", "CTU-Malware-Capture-Botnet-293-1", "CTU-Malware-Capture-Botnet-294-1", "CTU-Malware-Capture-Botnet-295-1", "CTU-Malware-Capture-Botnet-296-1", "CTU-Malware-Capture-Botnet-297-1", "CTU-Malware-Capture-Botnet-299-1", "CTU-Malware-Capture-Botnet-300-1", "CTU-Malware-Capture-Botnet-301-1", "CTU-Malware-Capture-Botnet-302-1", "CTU-Malware-Capture-Botnet-303-1", "CTU-Malware-Capture-Botnet-305-1", "CTU-Malware-Capture-Botnet-305-2", "CTU-Malware-Capture-Botnet-306-1", "CTU-Malware-Capture-Botnet-308-1", "CTU-Malware-Capture-Botnet-31-1", "CTU-Malware-Capture-Botnet-315-1", "CTU-Malware-Capture-Botnet-318-1", "CTU-Malware-Capture-Botnet-320-1", "CTU-Malware-Capture-Botnet-320-2", "CTU-Malware-Capture-Botnet-321-1", "CTU-Malware-Capture-Botnet-322-1", "CTU-Malware-Capture-Botnet-323-1", "CTU-Malware-Capture-Botnet-324-1", "CTU-Malware-Capture-Botnet-325-1", "CTU-Malware-Capture-Botnet-326-1", "CTU-Malware-Capture-Botnet-327-1", "CTU-Malware-Capture-Botnet-327-2", "CTU-Malware-Capture-Botnet-328-1", "CTU-Malware-Capture-Botnet-329-1", "CTU-Malware-Capture-Botnet-334-1", "CTU-Malware-Capture-Botnet-335-1", "CTU-Malware-Capture-Botnet-336-1", "CTU-Malware-Capture-Botnet-339-1", "CTU-Malware-Capture-Botnet-340-1", "CTU-Malware-Capture-Botnet-341-1", "CTU-Malware-Capture-Botnet-344-1", "CTU-Malware-Capture-Botnet-345-1", "CTU-Malware-Capture-Botnet-346-1", "CTU-Malware-Capture-Botnet-348-1", "CTU-Malware-Capture-Botnet-349-1", "CTU-Malware-Capture-Botnet-350-1", "CTU-Malware-Capture-Botnet-352-1", "CTU-Malware-Capture-Botnet-354-1", "CTU-Malware-Capture-Botnet-42", "CTU-Malware-Capture-Botnet-43", 
"CTU-Malware-Capture-Botnet-44", "CTU-Malware-Capture-Botnet-45", "CTU-Malware-Capture-Botnet-46", "CTU-Malware-Capture-Botnet-47", "CTU-Malware-Capture-Botnet-48", "CTU-Malware-Capture-Botnet-49", "CTU-Malware-Capture-Botnet-50", "CTU-Malware-Capture-Botnet-51", "CTU-Malware-Capture-Botnet-52", "CTU-Malware-Capture-Botnet-53", "CTU-Malware-Capture-Botnet-54", "CTU-Malware-Capture-Botnet-69", "CTU-Malware-Capture-Botnet-78-1", "CTU-Malware-Capture-Botnet-78-2", "CTU-Malware-Capture-Botnet-83-1", "CTU-Malware-Capture-Botnet-83-2", "CTU-Malware-Capture-Botnet-90", "CTU-Normal-12", "CTU-Normal-20", "CTU-Normal-21", "CTU-Normal-22", "CTU-Normal-23", "CTU-Normal-24", "CTU-Normal-25", "CTU-Normal-26", "CTU-Normal-27" "CTU-Normal-28", "CTU-Normal-29", "CTU-Normal-30", "CTU-Normal-31", "CTU-Normal-32", "CTU-Normal-6-filtered", "CTU-Normal-7", "CTU-Normal-8-1", "CTU-Normal-8-2", "CTU-Normal-9"] 67 | 68 | 69 | def find_files(url): 70 | soup = BeautifulSoup(requests.get(url, verify=False).text, "lxml") 71 | hrefs = [] 72 | for a in soup.find_all('a'): 73 | if 'href' in a.attrs : 74 | hrefs.append(a['href']) 75 | return hrefs 76 | 77 | 78 | def compute_datasets_size(url): 79 | dataset_names = find_files(url) 80 | file_sizes = 0 81 | for i in range(len(dataset_names)): 82 | if dataset_names[i].replace("/", "") in datasets_to_download: 83 | #if 'CTU-Malware-Capture-Botnet-' in dataset_names[i] or 'CTU-Normal-' in dataset_names[i]: 84 | #number_name = int(dataset_names[i].split('-')[4].replace('/', '')) 85 | 86 | #if number_name < 248: 87 | # continue 88 | 89 | logger.info(url + dataset_names[i]) 90 | 91 | # Get content of the main page of dataset. 92 | content = find_files(url + dataset_names[i]) 93 | 94 | # Look into open folder to files there. There are binetflow, bro, ... 95 | # And find the bro folder in this list. 
96 | for j in range(len(content)): 97 | if 'bro' in content[j]: 98 | #print dataset_names[i] + content[j] 99 | file_sizes += save_manager(url, dataset_names[i]) 100 | break 101 | 102 | return file_sizes 103 | 104 | 105 | def save_manager(url, dataset_name): 106 | file_sizes = 0 107 | bro_files = find_files(url + dataset_name + 'bro/') 108 | 109 | if 'ssl.log' in bro_files: 110 | directory_name = c.datasets_folder_general + dataset_name 111 | #if os.path.exists(directory_name): 112 | # shutil.rmtree(directory_name) 113 | 114 | if not os.path.exists(directory_name): 115 | os.makedirs(directory_name) 116 | 117 | 118 | url_dataset = url + dataset_name 119 | for filename in find_files(url_dataset): 120 | # Download Readme file 121 | if "README" in filename and not os.path.exists(directory_name + filename): 122 | save_file(url_dataset + filename, directory_name + filename) 123 | # Download pcap file 124 | if filename.endswith(".pcap") and not os.path.exists(directory_name + filename): 125 | save_file(url_dataset + filename, directory_name + filename) 126 | #url_file = url + dataset_name + "README.html" 127 | #file_name = directory_name + "README.html" 128 | 129 | 130 | 131 | 132 | 133 | folder_bro = directory_name + "bro/" 134 | if not os.path.exists(folder_bro): 135 | os.makedirs(folder_bro) 136 | 137 | for bro_log in bro_files: 138 | if bro_log.endswith('.log') and bro_log in files_to_download: 139 | if not os.path.exists(directory_name + "bro/" + bro_log): # If file does not exists on hdd 140 | logger.info(url + dataset_name) 141 | url_file = url + dataset_name + 'bro/' + bro_log 142 | file_sizes += save_file(url_file, folder_bro + bro_log) 143 | 144 | return file_sizes 145 | 146 | 147 | def save_file(url_file, file_name): 148 | logger.info(url_file + " is downloading...") 149 | file_size = 0 150 | # https://stackoverflow.com/a/28052583 151 | req = urllib2.Request(url, headers={ 'X-Mashape-Key': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' }) 152 | gcontext = 
ssl.SSLContext(ssl.PROTOCOL_TLSv1) 153 | u = urllib2.urlopen(url_file, context=gcontext) 154 | meta = u.info() 155 | file_size += int(meta.getheaders("Content-Length")[0]) 156 | 157 | f = open(file_name, 'wb') 158 | #logger.info("Downloading: %s Bytes: %s" % (file_name, file_size)) 159 | 160 | file_size_dl = 0 161 | block_sz = 8192 162 | while True: 163 | buffer = u.read(block_sz) 164 | if not buffer: 165 | break 166 | 167 | file_size_dl += len(buffer) 168 | f.write(buffer) 169 | status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size) 170 | status = status + chr(8) * (len(status) + 1) 171 | #logger.info(status) 172 | 173 | f.close() 174 | return file_size 175 | 176 | 177 | if __name__ == '__main__': 178 | start_time = time.time() 179 | datasets_size = 0 180 | if len(sys.argv) == 2: 181 | url = sys.argv[1] 182 | datasets_size += compute_datasets_size(url) 183 | # find_files(url+'CTU-Malware-Capture-Botnet-31/') 184 | else: 185 | logger.error("Error: Please put argument.") 186 | logger.info("Complet Dataset size:" + str(datasets_size / (1024.0 * 1024.0)) + "MB") 187 | total_time = datetime.timedelta(seconds=time.time() - start_time) 188 | logger.info("Time : " + str(total_time)) # .strftime('%H:%M:%S')) 189 | 190 | -------------------------------------------------------------------------------- /dataset_tools/infected_ips.json: -------------------------------------------------------------------------------- 1 | {"CTU-Malware-Capture-Botnet-116-4": [""], "CTU-Malware-Capture-Botnet-336-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-116-2": ["192.168.0.250,192.168.0.251"], "CTU-Malware-Capture-Botnet-221-2": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-221-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-116-1": [""], "CTU-Malware-Capture-Botnet-117-1": [""], "CTU-Malware-Capture-Botnet-322-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-335-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-227-1": ["192.168.1.116"], 
"CTU-Malware-Capture-Botnet-208-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-321-1": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", "192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", "192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", 
"192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", "192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", "192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", 
"192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-111-1": ["10.0.2.110"], "CTU-Malware-Capture-Botnet-112-4": [""], "CTU-Malware-Capture-Botnet-78-2": ["10.0.2.108"], "CTU-Malware-Capture-Botnet-266-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-205-2": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-112-2": [""], "CTU-Malware-Capture-Botnet-112-1": [""], "CTU-Malware-Capture-Botnet-257-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-203-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-261-3": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-261-2": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-261-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-163-1": ["10.0.2.106"], "CTU-Malware-Capture-Botnet-339-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-261-4": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", 
"192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", "192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", "192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", 
"192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", "192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", "192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-145-1": [""], "CTU-Malware-Capture-Botnet-17-2": ["10.0.2.119"], "CTU-Malware-Capture-Botnet-17-1": ["10.0.2.118"], "CTU-Malware-Capture-Botnet-123-1": [""], "CTU-Malware-Capture-Botnet-341-1": ["192.168.1.134"], "CTU-Malware-Capture-Botnet-164-1": ["10.0.2.200"], "CTU-Malware-Capture-Botnet-110-1": [""], "CTU-Malware-Capture-Botnet-270-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-137-1": [""], "CTU-Malware-Capture-Botnet-169-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-169-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-169-3": ["192.168.1.114"], 
"CTU-Malware-Capture-Botnet-281-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-349-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-243-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-230-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-230-2": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-179-1": ["10.0.2.113"], "CTU-Malware-Capture-Botnet-282-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-350-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-244-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-177-1": ["10.0.2.11"], "CTU-Malware-Capture-Botnet-174-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-247-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-297-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-306-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-142-1": [""], "CTU-Malware-Capture-Botnet-239-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-141-2": [""], "CTU-Malware-Capture-Botnet-301-1": ["192.168.1.0", "192.168.1.1", "192.168.1.2", "192.168.1.3", "192.168.1.4", "192.168.1.5", "192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11", "192.168.1.12", "192.168.1.13", "192.168.1.14", "192.168.1.15", "192.168.1.16", "192.168.1.17", "192.168.1.18", "192.168.1.19", "192.168.1.20", "192.168.1.21", "192.168.1.22", "192.168.1.23", "192.168.1.24", "192.168.1.25", "192.168.1.26", "192.168.1.27", "192.168.1.28", "192.168.1.29", "192.168.1.30", "192.168.1.31", "192.168.1.32", "192.168.1.33", "192.168.1.34", "192.168.1.35", "192.168.1.36", "192.168.1.37", "192.168.1.38", "192.168.1.39", "192.168.1.40", "192.168.1.41", "192.168.1.42", "192.168.1.43", "192.168.1.44", "192.168.1.45", "192.168.1.46", "192.168.1.47", "192.168.1.48", "192.168.1.49", "192.168.1.50", "192.168.1.51", "192.168.1.52", "192.168.1.53", "192.168.1.54", "192.168.1.55", "192.168.1.56", "192.168.1.57", "192.168.1.58", "192.168.1.59", "192.168.1.60", "192.168.1.61", "192.168.1.62", "192.168.1.63", "192.168.1.64", "192.168.1.65", 
"192.168.1.66", "192.168.1.67", "192.168.1.68", "192.168.1.69", "192.168.1.70", "192.168.1.71", "192.168.1.72", "192.168.1.73", "192.168.1.74", "192.168.1.75", "192.168.1.76", "192.168.1.77", "192.168.1.78", "192.168.1.79", "192.168.1.80", "192.168.1.81", "192.168.1.82", "192.168.1.83", "192.168.1.84", "192.168.1.85", "192.168.1.86", "192.168.1.87", "192.168.1.88", "192.168.1.89", "192.168.1.90", "192.168.1.91", "192.168.1.92", "192.168.1.93", "192.168.1.94", "192.168.1.95", "192.168.1.96", "192.168.1.97", "192.168.1.98", "192.168.1.99", "192.168.1.100", "192.168.1.101", "192.168.1.102", "192.168.1.103", "192.168.1.104", "192.168.1.105", "192.168.1.106", "192.168.1.107", "192.168.1.108", "192.168.1.109", "192.168.1.110", "192.168.1.111", "192.168.1.112", "192.168.1.113", "192.168.1.114", "192.168.1.115", "192.168.1.116", "192.168.1.117", "192.168.1.118", "192.168.1.119", "192.168.1.120", "192.168.1.121", "192.168.1.122", "192.168.1.123", "192.168.1.124", "192.168.1.125", "192.168.1.126", "192.168.1.127", "192.168.1.128", "192.168.1.129", "192.168.1.130", "192.168.1.131", "192.168.1.132", "192.168.1.133", "192.168.1.134", "192.168.1.135", "192.168.1.136", "192.168.1.137", "192.168.1.138", "192.168.1.139", "192.168.1.140", "192.168.1.141", "192.168.1.142", "192.168.1.143", "192.168.1.144", "192.168.1.145", "192.168.1.146", "192.168.1.147", "192.168.1.148", "192.168.1.149", "192.168.1.150", "192.168.1.151", "192.168.1.152", "192.168.1.153", "192.168.1.154", "192.168.1.155", "192.168.1.156", "192.168.1.157", "192.168.1.158", "192.168.1.159", "192.168.1.160", "192.168.1.161", "192.168.1.162", "192.168.1.163", "192.168.1.164", "192.168.1.165", "192.168.1.166", "192.168.1.167", "192.168.1.168", "192.168.1.169", "192.168.1.170", "192.168.1.171", "192.168.1.172", "192.168.1.173", "192.168.1.174", "192.168.1.175", "192.168.1.176", "192.168.1.177", "192.168.1.178", "192.168.1.179", "192.168.1.180", "192.168.1.181", "192.168.1.182", "192.168.1.183", "192.168.1.184", 
"192.168.1.185", "192.168.1.186", "192.168.1.187", "192.168.1.188", "192.168.1.189", "192.168.1.190", "192.168.1.191", "192.168.1.192", "192.168.1.193", "192.168.1.194", "192.168.1.195", "192.168.1.196", "192.168.1.197", "192.168.1.198", "192.168.1.199", "192.168.1.200", "192.168.1.201", "192.168.1.202", "192.168.1.203", "192.168.1.204", "192.168.1.205", "192.168.1.206", "192.168.1.207", "192.168.1.208", "192.168.1.209", "192.168.1.210", "192.168.1.211", "192.168.1.212", "192.168.1.213", "192.168.1.214", "192.168.1.215", "192.168.1.216", "192.168.1.217", "192.168.1.218", "192.168.1.219", "192.168.1.220", "192.168.1.221", "192.168.1.222", "192.168.1.223", "192.168.1.224", "192.168.1.225", "192.168.1.226", "192.168.1.227", "192.168.1.228", "192.168.1.229", "192.168.1.230", "192.168.1.231", "192.168.1.232", "192.168.1.233", "192.168.1.234", "192.168.1.235", "192.168.1.236", "192.168.1.237", "192.168.1.238", "192.168.1.239", "192.168.1.240", "192.168.1.241", "192.168.1.242", "192.168.1.243", "192.168.1.244", "192.168.1.245", "192.168.1.246", "192.168.1.247", "192.168.1.248", "192.168.1.249", "192.168.1.250", "192.168.1.251", "192.168.1.252", "192.168.1.253", "192.168.1.254", "192.168.1.255"], "CTU-Malware-Capture-Botnet-141-1": [""], "CTU-Malware-Capture-Botnet-69": ["10.0.2.117"], "CTU-Malware-Capture-Botnet-273-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-295-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-184-1": ["192.168.1.102"], "CTU-Malware-Capture-Botnet-138-1": ["54.242.92.108", "10.0.0.151", "10.0.0.152", "10.0.0.153"], "CTU-Malware-Capture-Botnet-279-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-25-4": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-5": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-6": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-249-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-25-1": ["10.0.2.106"], "CTU-Malware-Capture-Botnet-25-2": ["10.0.2.103"], "CTU-Malware-Capture-Botnet-25-3": ["10.0.2.103"], 
"CTU-Malware-Capture-Botnet-189-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-189-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-292-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-352-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-31-1": ["10.0.2.110"], "CTU-Malware-Capture-Botnet-345-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-181-1": ["10.0.2.116"], "CTU-Malware-Capture-Botnet-183-1": ["192.168.1.102"], "CTU-Malware-Capture-Botnet-205-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-305-2": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-324-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-129-1": [""], "CTU-Malware-Capture-Botnet-264-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-90": ["192.168.3.104"], "CTU-Malware-Capture-Botnet-346-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-224-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-78-1": ["10.0.2.108"], "CTU-Malware-Capture-Botnet-318-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-199-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-194-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-199-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-274-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-315-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-219-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-219-3": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-219-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-277-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-294-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-291-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-327-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-327-2": ["192.168.113"], "CTU-Malware-Capture-Botnet-110-4": [""], "CTU-Malware-Capture-Botnet-110-5": [""], "CTU-Malware-Capture-Botnet-110-6": [""], "CTU-Malware-Capture-Botnet-220-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-293-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-110-2": [""], "CTU-Malware-Capture-Botnet-211-2": 
["192.168.1.112"], "CTU-Malware-Capture-Botnet-211-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-213-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-305-1": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-320-2": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-320-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-290-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-251-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-201-1": ["192.168.1.125"], "CTU-Malware-Capture-Botnet-253-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-209-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-204-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-263-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-162-2": [""], "CTU-Malware-Capture-Botnet-162-1": [""], "CTU-Malware-Capture-Botnet-260-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-287-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-334-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-323-1": ["192.168.1.127"], "CTU-Malware-Capture-Botnet-1": ["10.0.2.22", "10.0.2.112"], "CTU-Malware-Capture-Botnet-280-1": ["192.168.1.106"], "CTU-Malware-Capture-Botnet-202-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-200-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-120-1": [""], "CTU-Malware-Capture-Botnet-144-1": [""], "CTU-Malware-Capture-Botnet-143-1": [""], "CTU-Malware-Capture-Botnet-242-1": ["192.168.1.220"], "CTU-Malware-Capture-Botnet-240-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-232-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-178-1": ["10.0.2.112"], "CTU-Malware-Capture-Botnet-231-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-175-1": ["10.0.2.109"], "CTU-Malware-Capture-Botnet-340-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-302-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-245-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-248-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-299-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-246-1": ["192.168.1.110"], 
"CTU-Malware-Capture-Botnet-238-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-328-1": ["192.168.1.122"], "CTU-Malware-Capture-Botnet-193-2": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-237-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-348-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-235-1": ["192.168.1.126"], "CTU-Malware-Capture-Botnet-140-2": [""], "CTU-Malware-Capture-Botnet-140-1": [""], "CTU-Malware-Capture-Botnet-196-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-157-1": ["222.179.116.23"], "CTU-Malware-Capture-Botnet-186-1": ["192.168.1.128"], "CTU-Malware-Capture-Botnet-254-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-217-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-308-1": ["192.168.1.117"], "CTU-Malware-Capture-Botnet-241-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-329-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-102": ["10.0.2.102"], "CTU-Malware-Capture-Botnet-180-1": ["10.0.2.115"], "CTU-Malware-Capture-Botnet-185-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-300-1": ["192.168.1.106"], "CTU-Malware-Capture-Botnet-188-4": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-188-3": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-188-2": ["192.168.1.113"], "CTU-Malware-Capture-Botnet-188-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-303-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-296-1": ["192.168.1.112"], "CTU-Malware-Capture-Botnet-278-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-275-1": ["192.168.1.124"], "CTU-Malware-Capture-Botnet-325-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-267-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-326-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-153-1": [""], "CTU-Malware-Capture-Botnet-198-1": ["192.168.1.121"], "CTU-Malware-Capture-Botnet-354-1": ["192.168.1.115"], "CTU-Malware-Capture-Botnet-344-1": ["192.168.1.120"], "CTU-Malware-Capture-Botnet-83-1": ["10.0.2.102"], "CTU-Malware-Capture-Botnet-83-2": ["10.0.2.102"], 
"CTU-Malware-Capture-Botnet-228-1": ["192.168.1.123"], "CTU-Malware-Capture-Botnet-195-1": [""], "CTU-Malware-Capture-Botnet-187-1": ["192.168.1.110"], "CTU-Malware-Capture-Botnet-215-2": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-222-1": ["192.168.1.116"], "CTU-Malware-Capture-Botnet-193-1": ["192.168.1.130"], "CTU-Malware-Capture-Botnet-215-1": ["192.168.1.118"], "CTU-Malware-Capture-Botnet-317-1": [""], "CTU-Malware-Capture-Botnet-210-1": ["192.168.1.119"], "CTU-Malware-Capture-Botnet-218-1": ["192.168.1.129"], "CTU-Malware-Capture-Botnet-285-1": ["192.168.1.135"], "CTU-Malware-Capture-Botnet-265-1": ["192.168.1.114"], "CTU-Malware-Capture-Botnet-111-5": [""], "CTU-Malware-Capture-Botnet-333-1": [""]} -------------------------------------------------------------------------------- /dataset_tools/label_mcfp_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import config as c 3 | import json 4 | 5 | def check_conn_label(dataset_path, normal_ips, infected_ips): 6 | print "<< Labeling " + dataset_path 7 | flow_array = [] 8 | space = '\t' 9 | normal_label = 0 10 | malware_label = 0 11 | with open(dataset_path + '/bro/conn.log', 'r') as f: 12 | for line in f: 13 | newline = line 14 | 15 | if line[0] != '#': 16 | split = line.split('\t') 17 | src_address = split[2] 18 | 19 | if src_address in normal_ips: 20 | newline = line.rstrip() + space + "From-Normal" + "\n" 21 | normal_label += 1 22 | elif src_address in infected_ips: 23 | newline = line.rstrip() + space + "From-Botnet" + "\n" 24 | malware_label += 1 25 | else: 26 | if 'fields' in line: 27 | newline = line.rstrip() + space + "label" + "\n" 28 | elif 'types' in line: 29 | newline = line.rstrip() + space + "string" + "\n" 30 | 31 | flow_array.append(newline) 32 | 33 | if "#close" in line: 34 | break 35 | 36 | print "normals:", normal_label 37 | print "malwares:", malware_label 38 | print " << End Labeling " + dataset_path 39 | return 
flow_array 40 | 41 | def write_conn(path, flow_array): 42 | print "<< Writing new flows to " + path 43 | index = 0 44 | with open(path + '/bro/conn_label.log', 'w') as f: 45 | for i in range(len(flow_array)): 46 | f.write(flow_array[i]) 47 | index += 1 48 | print " << Number of lines:", index 49 | print "<< New file conn_label.log was succesfly created." 50 | 51 | if __name__ == '__main__': 52 | 53 | with open('./infected_ips.json', 'r') as f: 54 | infected_ips = json.load(f) 55 | 56 | with open('./normal_ips.json', 'r') as f: 57 | normal_ips = json.load(f) 58 | 59 | for sub_set in os.listdir(c.datasets_folder_general): 60 | if sub_set.startswith("CTU-Malware-Capture-Botnet-") : 61 | dataset_number = int(sub_set.split('-')[4]) 62 | if (dataset_number <= 42 or dataset_number >= 54) \ 63 | and (sub_set in infected_ips or sub_set in normal_ips): 64 | flow_array = check_conn_label(c.datasets_folder_general + sub_set, normal_ips[sub_set], infected_ips[sub_set]) 65 | write_conn(c.datasets_folder_general + sub_set, flow_array) 66 | -------------------------------------------------------------------------------- /dataset_tools/label_normal_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import config as c 3 | 4 | 5 | def check_conn_label(dataset_path): 6 | print "<< Labeling " + dataset_path 7 | flow_array = [] 8 | space = ' ' 9 | normal_label = 0 10 | with open(dataset_path + '/bro/conn.log', 'r') as f: 11 | for line in f: 12 | newline = line 13 | if not ('#' == line[0]): 14 | newline = line.rstrip() + space + "From-Normal" + "\n" 15 | normal_label += 1 16 | else: 17 | if 'fields' in line: 18 | newline = line.rstrip() + space + "label" + "\n" 19 | elif 'types' in line: 20 | newline = line.rstrip() + space + "string" + "\n" 21 | 22 | flow_array.append(newline) 23 | 24 | print "normals:", normal_label 25 | print " << End Labeling " + dataset_path 26 | return flow_array 27 | 28 | def write_conn(path, 
flow_array): 29 | print "<< Writing new flows to " + path 30 | index = 0 31 | with open(path + '/bro/conn_label.log', 'w') as f: 32 | for i in range(len(flow_array)): 33 | f.write(flow_array[i]) 34 | index += 1 35 | print " << Number of lines:", index 36 | print "<< New file conn_label.log was succesfly created." 37 | 38 | if __name__ == '__main__': 39 | for sub_set in os.listdir(c.datasets_folder): 40 | if not sub_set.startswith(".") and sub_set.startswith("CTU-Normal"): 41 | flow_array = check_conn_label(c.datasets_folder + sub_set) 42 | write_conn(c.datasets_folder + sub_set, flow_array) 43 | -------------------------------------------------------------------------------- /dataset_tools/normal_ips.json: -------------------------------------------------------------------------------- 1 | {"CTU-Malware-Capture-Botnet-116-4": [""], "CTU-Malware-Capture-Botnet-336-1": [""], "CTU-Malware-Capture-Botnet-116-2": [""], "CTU-Malware-Capture-Botnet-221-2": [""], "CTU-Malware-Capture-Botnet-221-1": [""], "CTU-Malware-Capture-Botnet-116-1": ["46.105.227.94"], "CTU-Malware-Capture-Botnet-117-1": [""], "CTU-Malware-Capture-Botnet-322-1": [""], "CTU-Malware-Capture-Botnet-335-1": [""], "CTU-Malware-Capture-Botnet-227-1": [""], "CTU-Malware-Capture-Botnet-208-2": [""], "CTU-Malware-Capture-Botnet-321-1": [""], "CTU-Malware-Capture-Botnet-111-1": [""], "CTU-Malware-Capture-Botnet-112-4": [""], "CTU-Malware-Capture-Botnet-78-2": [""], "CTU-Malware-Capture-Botnet-266-1": [""], "CTU-Malware-Capture-Botnet-205-2": [""], "CTU-Malware-Capture-Botnet-112-2": [""], "CTU-Malware-Capture-Botnet-112-1": [""], "CTU-Malware-Capture-Botnet-257-1": [""], "CTU-Malware-Capture-Botnet-203-1": [""], "CTU-Malware-Capture-Botnet-261-3": [""], "CTU-Malware-Capture-Botnet-261-2": [""], "CTU-Malware-Capture-Botnet-261-1": [""], "CTU-Malware-Capture-Botnet-163-1": [""], "CTU-Malware-Capture-Botnet-339-1": [""], "CTU-Malware-Capture-Botnet-261-4": [""], "CTU-Malware-Capture-Botnet-145-1": [""], 
"CTU-Malware-Capture-Botnet-17-2": [""], "CTU-Malware-Capture-Botnet-17-1": [""], "CTU-Malware-Capture-Botnet-123-1": [""], "CTU-Malware-Capture-Botnet-341-1": [""], "CTU-Malware-Capture-Botnet-164-1": [""], "CTU-Malware-Capture-Botnet-110-1": [""], "CTU-Malware-Capture-Botnet-270-1": [""], "CTU-Malware-Capture-Botnet-137-1": [""], "CTU-Malware-Capture-Botnet-169-1": [""], "CTU-Malware-Capture-Botnet-169-2": [""], "CTU-Malware-Capture-Botnet-169-3": [""], "CTU-Malware-Capture-Botnet-281-1": [""], "CTU-Malware-Capture-Botnet-349-1": [""], "CTU-Malware-Capture-Botnet-243-1": [""], "CTU-Malware-Capture-Botnet-230-1": [""], "CTU-Malware-Capture-Botnet-230-2": [""], "CTU-Malware-Capture-Botnet-179-1": [""], "CTU-Malware-Capture-Botnet-282-1": [""], "CTU-Malware-Capture-Botnet-350-1": [""], "CTU-Malware-Capture-Botnet-244-1": [""], "CTU-Malware-Capture-Botnet-177-1": [""], "CTU-Malware-Capture-Botnet-174-1": [""], "CTU-Malware-Capture-Botnet-247-1": [""], "CTU-Malware-Capture-Botnet-297-1": [""], "CTU-Malware-Capture-Botnet-306-1": [""], "CTU-Malware-Capture-Botnet-142-1": [""], "CTU-Malware-Capture-Botnet-239-1": [""], "CTU-Malware-Capture-Botnet-141-2": [""], "CTU-Malware-Capture-Botnet-301-1": [""], "CTU-Malware-Capture-Botnet-141-1": [""], "CTU-Malware-Capture-Botnet-69": [""], "CTU-Malware-Capture-Botnet-273-1": [""], "CTU-Malware-Capture-Botnet-295-1": [""], "CTU-Malware-Capture-Botnet-184-1": [""], "CTU-Malware-Capture-Botnet-138-1": [""], "CTU-Malware-Capture-Botnet-279-1": [""], "CTU-Malware-Capture-Botnet-25-4": [""], "CTU-Malware-Capture-Botnet-25-5": [""], "CTU-Malware-Capture-Botnet-25-6": [""], "CTU-Malware-Capture-Botnet-249-1": [""], "CTU-Malware-Capture-Botnet-25-1": [""], "CTU-Malware-Capture-Botnet-25-2": [""], "CTU-Malware-Capture-Botnet-25-3": [""], "CTU-Malware-Capture-Botnet-189-2": [""], "CTU-Malware-Capture-Botnet-189-1": [""], "CTU-Malware-Capture-Botnet-292-1": [""], "CTU-Malware-Capture-Botnet-352-1": [""], "CTU-Malware-Capture-Botnet-31-1": 
[""], "CTU-Malware-Capture-Botnet-345-1": [""], "CTU-Malware-Capture-Botnet-181-1": [""], "CTU-Malware-Capture-Botnet-183-1": [""], "CTU-Malware-Capture-Botnet-205-1": [""], "CTU-Malware-Capture-Botnet-305-2": [""], "CTU-Malware-Capture-Botnet-324-1": [""], "CTU-Malware-Capture-Botnet-129-1": [""], "CTU-Malware-Capture-Botnet-264-1": [""], "CTU-Malware-Capture-Botnet-90": [""], "CTU-Malware-Capture-Botnet-346-1": [""], "CTU-Malware-Capture-Botnet-224-1": [""], "CTU-Malware-Capture-Botnet-78-1": [""], "CTU-Malware-Capture-Botnet-318-1": [""], "CTU-Malware-Capture-Botnet-199-1": [""], "CTU-Malware-Capture-Botnet-194-1": [""], "CTU-Malware-Capture-Botnet-199-2": [""], "CTU-Malware-Capture-Botnet-274-1": [""], "CTU-Malware-Capture-Botnet-315-1": [""], "CTU-Malware-Capture-Botnet-219-2": [""], "CTU-Malware-Capture-Botnet-219-3": [""], "CTU-Malware-Capture-Botnet-219-1": [""], "CTU-Malware-Capture-Botnet-277-1": [""], "CTU-Malware-Capture-Botnet-294-1": [""], "CTU-Malware-Capture-Botnet-291-1": [""], "CTU-Malware-Capture-Botnet-327-1": [""], "CTU-Malware-Capture-Botnet-327-2": [""], "CTU-Malware-Capture-Botnet-110-4": [""], "CTU-Malware-Capture-Botnet-110-5": [""], "CTU-Malware-Capture-Botnet-110-6": [""], "CTU-Malware-Capture-Botnet-220-1": [""], "CTU-Malware-Capture-Botnet-293-1": [""], "CTU-Malware-Capture-Botnet-110-2": [""], "CTU-Malware-Capture-Botnet-211-2": [""], "CTU-Malware-Capture-Botnet-211-1": [""], "CTU-Malware-Capture-Botnet-213-1": [""], "CTU-Malware-Capture-Botnet-305-1": [""], "CTU-Malware-Capture-Botnet-320-2": [""], "CTU-Malware-Capture-Botnet-320-1": [""], "CTU-Malware-Capture-Botnet-290-1": [""], "CTU-Malware-Capture-Botnet-251-1": [""], "CTU-Malware-Capture-Botnet-201-1": [""], "CTU-Malware-Capture-Botnet-253-1": [""], "CTU-Malware-Capture-Botnet-209-1": [""], "CTU-Malware-Capture-Botnet-204-1": [""], "CTU-Malware-Capture-Botnet-263-1": [""], "CTU-Malware-Capture-Botnet-162-2": [""], "CTU-Malware-Capture-Botnet-162-1": [""], 
"CTU-Malware-Capture-Botnet-260-1": [""], "CTU-Malware-Capture-Botnet-287-1": [""], "CTU-Malware-Capture-Botnet-334-1": [""], "CTU-Malware-Capture-Botnet-323-1": [""], "CTU-Malware-Capture-Botnet-238-1": [""], "CTU-Malware-Capture-Botnet-280-1": [""], "CTU-Malware-Capture-Botnet-202-1": [""], "CTU-Malware-Capture-Botnet-200-1": [""], "CTU-Malware-Capture-Botnet-120-1": [""], "CTU-Malware-Capture-Botnet-144-1": [""], "CTU-Malware-Capture-Botnet-143-1": [""], "CTU-Malware-Capture-Botnet-242-1": [""], "CTU-Malware-Capture-Botnet-240-1": [""], "CTU-Malware-Capture-Botnet-232-1": [""], "CTU-Malware-Capture-Botnet-178-1": [""], "CTU-Malware-Capture-Botnet-231-1": [""], "CTU-Malware-Capture-Botnet-175-1": [""], "CTU-Malware-Capture-Botnet-340-1": [""], "CTU-Malware-Capture-Botnet-302-1": [""], "CTU-Malware-Capture-Botnet-245-1": [""], "CTU-Malware-Capture-Botnet-248-1": [""], "CTU-Malware-Capture-Botnet-299-1": [""], "CTU-Malware-Capture-Botnet-246-1": [""], "CTU-Malware-Capture-Botnet-1": [""], "CTU-Malware-Capture-Botnet-328-1": [""], "CTU-Malware-Capture-Botnet-222-1": [""], "CTU-Malware-Capture-Botnet-237-1": [""], "CTU-Malware-Capture-Botnet-348-1": [""], "CTU-Malware-Capture-Botnet-235-1": [""], "CTU-Malware-Capture-Botnet-140-2": [""], "CTU-Malware-Capture-Botnet-140-1": [""], "CTU-Malware-Capture-Botnet-196-1": [""], "CTU-Malware-Capture-Botnet-157-1": [""], "CTU-Malware-Capture-Botnet-186-1": [""], "CTU-Malware-Capture-Botnet-254-1": [""], "CTU-Malware-Capture-Botnet-217-1": [""], "CTU-Malware-Capture-Botnet-308-1": [""], "CTU-Malware-Capture-Botnet-241-1": [""], "CTU-Malware-Capture-Botnet-329-1": [""], "CTU-Malware-Capture-Botnet-102": [""], "CTU-Malware-Capture-Botnet-180-1": [""], "CTU-Malware-Capture-Botnet-185-1": [""], "CTU-Malware-Capture-Botnet-300-1": [""], "CTU-Malware-Capture-Botnet-188-4": [""], "CTU-Malware-Capture-Botnet-188-3": [""], "CTU-Malware-Capture-Botnet-188-2": [""], "CTU-Malware-Capture-Botnet-188-1": [""], 
"CTU-Malware-Capture-Botnet-303-1": [""], "CTU-Malware-Capture-Botnet-296-1": [""], "CTU-Malware-Capture-Botnet-278-1": [""], "CTU-Malware-Capture-Botnet-275-1": [""], "CTU-Malware-Capture-Botnet-325-1": [""], "CTU-Malware-Capture-Botnet-267-1": [""], "CTU-Malware-Capture-Botnet-326-1": [""], "CTU-Malware-Capture-Botnet-153-1": [""], "CTU-Malware-Capture-Botnet-198-1": [""], "CTU-Malware-Capture-Botnet-354-1": [""], "CTU-Malware-Capture-Botnet-344-1": [""], "CTU-Malware-Capture-Botnet-83-1": [""], "CTU-Malware-Capture-Botnet-83-2": [""], "CTU-Malware-Capture-Botnet-228-1": [""], "CTU-Malware-Capture-Botnet-195-1": [""], "CTU-Malware-Capture-Botnet-187-1": [""], "CTU-Malware-Capture-Botnet-215-2": [""], "CTU-Malware-Capture-Botnet-193-2": [""], "CTU-Malware-Capture-Botnet-193-1": [""], "CTU-Malware-Capture-Botnet-215-1": [""], "CTU-Malware-Capture-Botnet-317-1": [""], "CTU-Malware-Capture-Botnet-210-1": [""], "CTU-Malware-Capture-Botnet-218-1": [""], "CTU-Malware-Capture-Botnet-285-1": [""], "CTU-Malware-Capture-Botnet-265-1": [""], "CTU-Malware-Capture-Botnet-111-5": [""], "CTU-Malware-Capture-Botnet-333-1":[""]} -------------------------------------------------------------------------------- /example_config.py: -------------------------------------------------------------------------------- 1 | # Only absolute paths 2 | # /!\ Don't forget "/" at the end for folders 3 | datasets_folder_general = "/Volumes/Data/datasets_general/folder/" 4 | datasets_folder = "absolute_path/to/my/datasets/folder/" 5 | datasets_discarded_folder = "absolute_path/to/my/discarded_datasets/folder/" 6 | 7 | results_folder = "absolute_path/to/my/results/folder/" 8 | results_folder_backup = "absolute_path/to/my/results_backup/folder/" 9 | model_folder = results_folder + "model/" 10 | logs_folder = results_folder + "logs/" 11 | graphs_folder = results_folder + "graphs/" 12 | 13 | alexa_folder = "absolute_path/to/my/alexa/folder/" 14 | top_level_domain_file = 
"absolute_path/to/my/top_level_domain/file" 15 | 16 | training_output_file = model_folder + "training_output_file.txt" -------------------------------------------------------------------------------- /features_extraction/CertificateFeatures.py: -------------------------------------------------------------------------------- 1 | 2 | class CertificateFeatures: 3 | 4 | def __init__(self, cert_serial, x509_line): 5 | self.servernames_dict = dict() 6 | self.cert_serial = cert_serial 7 | self.x509_line = x509_line 8 | self.malware_labels = 0 9 | self.normal_labels = 0 10 | 11 | self.not_valid_certificate_number = 0 12 | self.cert_percent_validity = [] 13 | self.is_CN_in_SAN_f = -1 14 | self.certificate_key_length = -1 15 | self.number_san_domains = 0 16 | self.number_x509_lines = 0 17 | 18 | self.process_certificate(x509_line) 19 | 20 | def process_certificate(self, x509_line): 21 | self.is_CN_in_SAN(x509_line) 22 | 23 | split = x509_line.split(' ') 24 | 25 | self.certificate_key_length = float(split[11]) 26 | 27 | # number of domain in san in x509 28 | if split[14] != '-': 29 | domains = len(split[14].split(',')) 30 | self.number_san_domains += domains 31 | 32 | def add_server_name(self, server_name, label): 33 | try: 34 | if self.servernames_dict[server_name]: 35 | pass 36 | except: 37 | self.servernames_dict[server_name] = 1 38 | 39 | if 'Botnet' in label: 40 | self.malware_labels += 1 41 | if 'Normal' in label: 42 | self.normal_labels += 1 43 | 44 | def contain_server_name(self, server_name): 45 | try: 46 | if self.servernames_dict[server_name]: 47 | return self.x509_line 48 | except: 49 | return 0 50 | 51 | def is_malware(self): 52 | if self.malware_labels != 0 and self.normal_labels != 0: 53 | print "Error: There are more malwares and more normals! 
Cert serial:", self.cert_serial 54 | print " " + "malwares:", self.malware_labels, "normals", self.normal_labels 55 | print " " + "SNI:" 56 | print self.servernames_dict.keys() 57 | 58 | if self.malware_labels > self.normal_labels: 59 | return True 60 | return False 61 | 62 | def add_x509_line(self, x509_line): 63 | split = x509_line.split(' ') 64 | 65 | if split[7] != '-' and split[6] != '-': 66 | try: 67 | current_time = float(split[0]) 68 | before_date = float(split[6]) 69 | after_date = float(split[7]) 70 | if current_time > after_date or current_time < before_date: 71 | self.not_valid_certificate_number += 1 72 | # print split[1], before_date, current_time, after_date 73 | 74 | # certificate ratio 75 | norm_after = after_date - before_date # 31622399 76 | current_time_norm = current_time - before_date # 12025263 77 | self.cert_percent_validity.append(current_time_norm / norm_after) 78 | 79 | self.number_x509_lines += 1 80 | except: 81 | print "Certificate time length is broken." 82 | 83 | 84 | def is_CN_in_SAN(self, x509_line): 85 | x509_split = x509_line.split(' ') 86 | if x509_split[14] != '-': 87 | CN_part = x509_split[4] 88 | SAN_dns_list = x509_split[14].split(',') 89 | for i in range(len(SAN_dns_list)): 90 | if '*' in SAN_dns_list[i]: 91 | SAN_dns_list[i] = SAN_dns_list[i].replace('*', '') 92 | hit_2 = 0 93 | for san_dns in SAN_dns_list: 94 | if san_dns in CN_part: 95 | hit_2 = 1 96 | break 97 | self.is_CN_in_SAN_f = hit_2 98 | 99 | def get_label_of_connection(self): 100 | if self.malware_labels > self.normal_labels: 101 | return "MALWARE" 102 | else: 103 | return "NORMAL" 104 | """ 105 | ------------- FEATERES --------------- 106 | """ 107 | # 1 CN is there 108 | # 0 is not there 109 | # -1 is not define 110 | def get_is_CN_in_SAN(self): 111 | return self.is_CN_in_SAN_f 112 | 113 | def get_certificate_key_length(self): 114 | return self.certificate_key_length 115 | 116 | def get_number_san_domains(self): 117 | return self.number_san_domains 118 | 119 | 
def get_number_of_server_name(self): 120 | return len(self.servernames_dict.keys()) 121 | 122 | def get_not_valid_certificate_number(self): 123 | if self.number_x509_lines != 0: 124 | return self.not_valid_certificate_number / float(self.number_x509_lines) 125 | return -1 126 | 127 | def get_certificate_ratio(self): 128 | if len(self.cert_percent_validity) != 0: 129 | temp = 0 130 | for i in self.cert_percent_validity: 131 | temp += i 132 | return temp / float(len(self.cert_percent_validity)) 133 | else: 134 | return -1 -------------------------------------------------------------------------------- /features_extraction/ComputeFeatures.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from ExtractFeatures import ExtractFeatures 3 | import config as c 4 | 5 | from logger import get_logger 6 | logger = get_logger("debug") 7 | 8 | class ComputeFeatures(ExtractFeatures): 9 | 10 | def __init__(self): 11 | super(ComputeFeatures, self).__init__() 12 | self.file_time_name = str(datetime.strftime(datetime.utcnow(), "%Y-%m-%d_%H-%M")) 13 | 14 | def add_cert_to_non_cert_conn(self): 15 | for key in self.connection_4_tuples.keys(): 16 | 17 | """ 18 | implementig feature: connection which have no certificate, but have at least one SNI, 19 | look, if in certificate_objects_dict is such servername with certificate 20 | """ 21 | break_v = 0 22 | if self.connection_4_tuples[key].get_amount_diff_certificates() == 0: 23 | 24 | server_names = self.connection_4_tuples[key].get_SNI_list() 25 | if len(server_names) != 0: 26 | for cert_serial in self.certificate_dict.keys(): 27 | for server_name in server_names: 28 | x509_line = self.certificate_dict[cert_serial].contain_server_name(server_name) 29 | if x509_line != 0: 30 | self.connection_4_tuples[key].add_ssl_log_2(x509_line) 31 | print "This Certificate was added after process:", "cert_serial:", cert_serial, "server_name=",server_name, "4-tuple=", key, "label:", 
self.connection_4_tuples[key].get_label_of_connection() 32 | break_v = 1 33 | break 34 | if break_v == 1: 35 | break 36 | 37 | def create_balanced_dataset(self): 38 | import csv 39 | from collections import OrderedDict 40 | 41 | botnet_lines = list() 42 | normal_lines = list() 43 | 44 | for key, con4tuple in self.connection_4_tuples.iteritems(): 45 | dest_ip = key[1] 46 | if dest_ip not in self.dns_connections_index: 47 | print dest_ip + "NOT FOUND IN DNS RECORDS..." 48 | else: 49 | dns_conn = self.dns_connections_index[dest_ip] 50 | 51 | features = OrderedDict() 52 | features["key"] = " ".join(key) 53 | features["number_of_flows"] = con4tuple.get_number_of_flows() 54 | features["average_of_duration"] = con4tuple.get_average_of_duration() 55 | features["standard_deviation_duration"] = con4tuple.get_standard_deviation_duration() 56 | features["percent_of_standard_deviation_duration"] = con4tuple.get_percent_of_standard_deviation_duration() 57 | features["total_size_of_flows_orig"] = con4tuple.get_total_size_of_flows_orig() 58 | features["total_size_of_flows_resp"] = con4tuple.get_total_size_of_flows_resp() 59 | features["ratio_of_sizes"] = con4tuple.get_ratio_of_sizes() 60 | features["percent_of_established_states"] = con4tuple.get_percent_of_established_states() 61 | features["inbound_pckts"] = con4tuple.get_inbound_pckts() 62 | features["outbound_pckts"] = con4tuple.get_outbound_pckts() 63 | features["periodicity_average"] = con4tuple.get_periodicity_average() 64 | features["periodicity_standart_deviation"] = con4tuple.get_periodicity_standart_deviation() 65 | features["ssl_ratio"] = con4tuple.get_ssl_ratio() 66 | features["average_public_key"] = con4tuple.get_average_public_key() 67 | features["tls_version_ratio"] = con4tuple.get_tls_version_ratio() 68 | features["average_of_certificate_length"] = con4tuple.get_average_of_certificate_length() 69 | features["standart_deviation_cert_length"] = con4tuple.get_standart_deviation_cert_length() 70 | 
features["is_valid_certificate_during_capture"] = con4tuple.is_valid_certificate_during_capture() 71 | features["amount_diff_certificates"] = con4tuple.get_amount_diff_certificates() 72 | features["number_of_domains_in_certificate"] = con4tuple.get_number_of_domains_in_certificate() 73 | features["get_certificate_ratio"] = con4tuple.get_certificate_ratio() 74 | features["number_of_certificate_path"] = con4tuple.get_number_of_certificate_path() 75 | features["x509_ssl_ratio"] = con4tuple.x509_ssl_ratio() 76 | features["SNI_ssl_ratio"] = con4tuple.SNI_ssl_ratio() 77 | features["self_signed_ratio"] = con4tuple.self_signed_ratio() 78 | features["is_SNIs_in_SNA_dns"] = con4tuple.is_SNIs_in_SNA_dns() 79 | features["SNI_equal_DstIP"] = con4tuple.get_SNI_equal_DstIP() 80 | features["is_CNs_in_SNA_dns"] = con4tuple.is_CNs_in_SNA_dns() 81 | 82 | # New features 83 | features["ratio_of_differ_SNI_in_ssl_log"] = con4tuple.ratio_of_differ_SNI_in_ssl_log() 84 | features["ratio_of_differ_subject_in_ssl_log"] = con4tuple.ratio_of_differ_subject_in_ssl_log() 85 | features["ratio_of_differ_issuer_in_ssl_log"] = con4tuple.ratio_of_differ_issuer_in_ssl_log() 86 | features["ratio_of_differ_subject_in_cert"] = con4tuple.ratio_of_differ_subject_in_cert() 87 | features["ratio_of_differ_issuer_in_cert"] = con4tuple.ratio_of_differ_issuer_in_cert() 88 | features["ratio_of_differ_sandns_in_cert"] = con4tuple.ratio_of_differ_sandns_in_cert() 89 | features["ratio_of_same_subjects"] = con4tuple.ratio_of_same_subjects() 90 | features["ratio_of_same_issuer"] = con4tuple.ratio_of_same_issuer() 91 | features["ratio_is_same_CN_and_SNI"] = con4tuple.ratio_is_same_CN_and_SNI() 92 | features["average_certificate_exponent"] = con4tuple.average_certificate_exponent() 93 | features["is_SNI_in_top_level_domain"] = con4tuple.is_SNI_in_top_level_domain() 94 | features["ratio_certificate_path_error"] = con4tuple.ratio_certificate_path_error() 95 | features["ratio_missing_cert_in_cert_path"] = 
con4tuple.ratio_missing_cert_in_cert_path() 96 | 97 | # DNS Features 98 | features.update(benchmark(dns_conn.compute_alexa_features)) 99 | features["FQDN_length"] = benchmark(dns_conn.get_FQDN_length) 100 | features["domain_name_length"] = benchmark(dns_conn.get_domain_name_length) 101 | features["number_of_numerical_chars"] = benchmark(dns_conn.get_number_of_numerical_chars) 102 | features["number_of_non_alphanumeric_chars"] = benchmark( 103 | dns_conn.get_number_of_non_alphanumeric_chars) 104 | features["number_unique_IP_addresses_in_response"] = benchmark( 105 | dns_conn.get_number_unique_IP_addresses_in_response) 106 | features["number_of_subdomains"] = benchmark(dns_conn.get_number_of_subdomains) 107 | features["average_ttls"] = benchmark(dns_conn.get_average_ttls) 108 | features["std_ttls"] = benchmark(dns_conn.get_std_ttls) 109 | features["min_ttls"] = benchmark(dns_conn.get_min_ttls) 110 | features["max_ttls"] = benchmark(dns_conn.get_max_ttls) 111 | features["number_of_hyphens_in_fqdn"] = benchmark(dns_conn.get_number_of_hyphens_in_fqdn) 112 | features["length_of_longest_subdomain_name"] = benchmark( 113 | dns_conn.get_length_of_longest_subdomain_name) 114 | features["number_of_voyels_in_fqdn"] = benchmark(dns_conn.get_number_of_voyels_in_fqdn) 115 | features["number_of_different_chars_in_fqdn"] = benchmark( 116 | dns_conn.get_number_of_different_chars_in_fqdn) 117 | features["number_of_consonants_in_fqdn"] = benchmark(dns_conn.get_number_of_consonants_in_fqdn) 118 | features["shannon_entropy_2ld"] = benchmark(dns_conn.get_shannon_entropy_2ld) 119 | features["shannon_entropy_3ld"] = benchmark(dns_conn.get_shannon_entropy_3ld) 120 | 121 | features["label"] = con4tuple.get_label_of_connection() 122 | 123 | if con4tuple.is_malware(): 124 | botnet_lines.append(features) 125 | else: 126 | normal_lines.append(features) 127 | 128 | # Shuffle & balance the whole dataset (50-50 botnet/normal traffic)\n 129 | from sklearn.utils import shuffle 130 | 131 | max_sample 
= min(len(botnet_lines), len(normal_lines)) 132 | 133 | logger.info("Number of Conn3tuples (botnet, normal) : {}".format((len(botnet_lines),len(normal_lines)))) 134 | logger.info("Down-sampling to {} conn4tuples/class".format(max_sample)) 135 | 136 | lines = shuffle(botnet_lines, n_samples=max_sample) + shuffle(normal_lines, n_samples=max_sample) 137 | logger.info("Total dataset lines: {}".format(len(lines))) 138 | 139 | with open(c.model_folder + 'features.csv', 'wb') as csvfile: 140 | writer = csv.DictWriter(csvfile, fieldnames=features.keys(), lineterminator='\n', delimiter=',', 141 | quoting=csv.QUOTE_NONNUMERIC) 142 | writer.writeheader() 143 | writer.writerows(lines) 144 | 145 | def create_dataset_dns(self): 146 | import csv 147 | from collections import OrderedDict 148 | 149 | with open(c.model_folder + 'dns_features.csv', 'wb') as csvfile: 150 | line = 0 151 | 152 | for key, dns_conn in self.dns_connections.iteritems(): 153 | features = OrderedDict() 154 | features["key"] = key 155 | features.update(benchmark(dns_conn.compute_alexa_features)) 156 | features["FQDN_length"] = benchmark(dns_conn.get_FQDN_length) 157 | features["domain_name_length"] = benchmark(dns_conn.get_domain_name_length) 158 | features["number_of_numerical_chars"] = benchmark(dns_conn.get_number_of_numerical_chars) 159 | features["number_of_non_alphanumeric_chars"] = benchmark(dns_conn.get_number_of_non_alphanumeric_chars) 160 | features["number_unique_IP_addresses_in_response"] = benchmark( 161 | dns_conn.get_number_unique_IP_addresses_in_response) 162 | features["number_of_subdomains"] = benchmark(dns_conn.get_number_of_subdomains) 163 | features["average_ttls"] = benchmark(dns_conn.get_average_ttls) 164 | features["std_ttls"] = benchmark(dns_conn.get_std_ttls) 165 | features["min_ttls"] = benchmark(dns_conn.get_min_ttls) 166 | features["max_ttls"] = benchmark(dns_conn.get_max_ttls) 167 | features["number_of_hyphens_in_fqdn"] = benchmark(dns_conn.get_number_of_hyphens_in_fqdn) 168 | 
features["length_of_longest_subdomain_name"] = benchmark(dns_conn.get_length_of_longest_subdomain_name) 169 | features["number_of_voyels_in_fqdn"] = benchmark(dns_conn.get_number_of_voyels_in_fqdn) 170 | features["number_of_different_chars_in_fqdn"] = benchmark( 171 | dns_conn.get_number_of_different_chars_in_fqdn) 172 | features["number_of_consonants_in_fqdn"] = benchmark(dns_conn.get_number_of_consonants_in_fqdn) 173 | features["shannon_entropy_2ld"] = benchmark(dns_conn.get_shannon_entropy_2ld) 174 | features["shannon_entropy_3ld"] = benchmark(dns_conn.get_shannon_entropy_3ld) 175 | 176 | if line == 0: 177 | writer = csv.DictWriter(csvfile, fieldnames=features.keys(), lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 178 | writer.writeheader() 179 | 180 | writer.writerow(features) 181 | line += 1 182 | 183 | def save_dataset_information(self): 184 | space = ' ' 185 | # with open("ExtractedData\\" + "conn_result.txt", 'w') as f: 186 | with open(c.model_folder + "/dataset_info.txt", 'w') as f: 187 | for key in self.dataset_information_dict.keys(): 188 | f.write(str(key) + space + 189 | str(self.dataset_information_dict[key].ssl_lines) + space + 190 | str(self.dataset_information_dict[key].not_founded_x509_lines) + space + 191 | str(self.dataset_information_dict[key].founded_x509_lines) + space + 192 | str(self.dataset_information_dict[key].err_not_added_x509) + 193 | "\n") 194 | f.close() 195 | 196 | 197 | """ 198 | Statistic methods. 
199 | """ 200 | def print_statistic(self): 201 | logger.info("-------------------------------------------") 202 | logger.info("----------- Statistic ---------------------") 203 | logger.info("-------------------------------------------") 204 | malware_certificates_array = [] 205 | 206 | normal_tuples = 0 207 | malware_tuples = 0 208 | flows_together = 0 209 | flows_normal = 0 210 | flows_malware = 0 211 | cert_together = 0 212 | cert_normal = 0 213 | cert_malware = 0 214 | for tuple_key in self.connection_4_tuples.keys(): 215 | conn_tuple = self.connection_4_tuples[tuple_key] 216 | flows_together += conn_tuple.get_number_of_ssl_flows() 217 | cert_together += len(conn_tuple.get_certificate_serial_dict().keys()) 218 | # More normal labels and malware labels in one 4-tuple ? 219 | if conn_tuple.get_malware_label() != 0 and conn_tuple.get_normal_label() != 0: 220 | logger.error("Error: More labels in one 4-tuples") 221 | # Same amout of labels in one 4-tuple? 222 | if conn_tuple.get_malware_label() == conn_tuple.get_normal_label(): 223 | logger.warning("Watch out: same amount of labels") 224 | logger.warning("Normal: {}".format(conn_tuple.get_normal_label())) 225 | logger.warning("Malware: {}".format(conn_tuple.get_malware_label())) 226 | 227 | if conn_tuple.is_malware(): 228 | malware_tuples += 1 229 | flows_malware += conn_tuple.get_number_of_ssl_flows() 230 | cert_malware += len(conn_tuple.get_certificate_serial_dict().keys()) 231 | 232 | malware_certificates_array += conn_tuple.get_x509_list() 233 | else: 234 | normal_tuples += 1 235 | flows_normal += conn_tuple.get_number_of_ssl_flows() 236 | cert_normal += len(conn_tuple.get_certificate_serial_dict().keys()) 237 | 238 | logger.info("Connection 4-tuples:") 239 | logger.info("All 4_tuples: {}".format(len(self.connection_4_tuples.keys()))) 240 | logger.info("Normal 4-tuples: {}".format(normal_tuples)) 241 | logger.info("Malware 4-tuples: {}".format(malware_tuples)) 242 | 243 | logger.info("Flows") 244 | 
logger.info("All gathered flows: {}".format(flows_together)) 245 | logger.info("Normal flows: {}".format(flows_normal)) 246 | logger.info("Malware flows: {}".format(flows_malware)) 247 | 248 | logger.info("Certificates") 249 | logger.info("All gathered certificates: {}".format(cert_together)) 250 | logger.info("Normal certificates: {}".format(cert_normal)) 251 | logger.info("Malware certificates: {}".format(cert_malware)) 252 | 253 | # Save malware certificates. 254 | self.save_malware_certificates(malware_certificates_array) 255 | 256 | def save_malware_certificates(self, x509_lines): 257 | with open(c.model_folder + '/malware_certificates', 'w') as f: 258 | for line in x509_lines: 259 | f.write(line + "\n") 260 | f.close() 261 | 262 | def benchmark(func, *params): 263 | #import datetime 264 | #import time 265 | #start_time = time.time() 266 | return_value = func(*params) if params else func() 267 | #total_time = datetime.timedelta(seconds=time.time() - start_time) 268 | #print("Function " + func.__name__ + " - execution time : " + str(total_time))#.strftime('%H:%M:%S')) 269 | return return_value -------------------------------------------------------------------------------- /features_extraction/ConnectionFeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Connection4tuple import Connection4tuple 3 | 4 | 5 | class ConnectionFeatures(Connection4tuple): 6 | 7 | def __init__(self, tuple_index): 8 | super(ConnectionFeatures, self).__init__(tuple_index) 9 | 10 | """ 11 | ---------- Get Feature ------------------- 12 | """ 13 | # --------------------------------------------------- 14 | # 01. ---------- Number of flows -------------------- 15 | def get_number_of_flows(self): 16 | return self.get_number_of_ssl_flows() + self.get_number_of_not_ssl_flows() 17 | 18 | # --------------------------------------------------- 19 | # ---------- Duration of flows ---------------------- 20 | # 02. 
Average 21 | def get_average_of_duration(self): 22 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 23 | if self.flow_which_has_duration_number != 0: 24 | return self.average_duration / float(self.flow_which_has_duration_number) 25 | return -1 26 | 27 | # 03. Standard deviation 28 | def get_standard_deviation_duration(self): 29 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 30 | # EX = self.average_duration / float(self.flow_which_has_duration_number) 31 | # EX2 = self.average_duration_power / float(self.flow_which_has_duration_number) # E(X^2) 32 | # DX = EX2 - EX*EX 33 | # return pow(DX, 0.5) 34 | if len(self.duration_list) != 0 and len(self.duration_list) > 2: 35 | return np.std(self.duration_list) 36 | return -1 37 | 38 | # 04. Percent of flows which are bigger or less than standard deviation with average 39 | def get_percent_of_standard_deviation_duration(self): 40 | # self.check_zero_dividing(self.flow_which_has_duration_number, "flow_which_has_duration_number is 0 !!!") 41 | if len(self.duration_list) != 0: 42 | out_of_bounds = 0 43 | lower_level = self.get_average_of_duration() - self.get_standard_deviation_duration() 44 | upper_level = self.get_average_of_duration() + self.get_standard_deviation_duration() 45 | for i in range(len(self.duration_list)): 46 | if self.duration_list[i] < lower_level: 47 | out_of_bounds += 1 48 | elif self.duration_list[i] > upper_level: 49 | out_of_bounds += 1 50 | 51 | return out_of_bounds / float(self.flow_which_has_duration_number) 52 | return -1 53 | 54 | # ------------------------------------------------------------------- 55 | # 05 -------- Total payload size of flows the originator sent -------- 56 | def get_total_size_of_flows_orig(self): 57 | return self.total_size_of_flows_orig 58 | 59 | # ------------------------------------------------------------------ 60 | # 06 -------- Total payload size of 
flows the responder sent -------- 61 | def get_total_size_of_flows_resp(self): 62 | return self.total_size_of_flows_resp 63 | 64 | # --------------------------------------------------------------------------- 65 | # 07 ------ Ratio of responder payload sizes and originator payload sizes ---- 66 | def get_ratio_of_sizes(self): 67 | # self.check_zero_dividing(self.total_size_of_flows_orig, "Original size is 0 !!!") 68 | if self.total_size_of_flows_orig != 0: 69 | return self.total_size_of_flows_resp / float(self.total_size_of_flows_orig) 70 | return -1 71 | 72 | # -------------------------------------------------------------------- 73 | # ------ State of connection ----------------------------------------- 74 | # 08 Percent of established connection 75 | def get_percent_of_established_states(self): 76 | establihed_states = 0 77 | total_value_states = 0 78 | for key in self.state_of_connection_dict.keys(): 79 | total_value_states += self.state_of_connection_dict[key] 80 | if total_value_states != 0: 81 | establihed_states += self.state_of_connection_dict.get('SF', 0) 82 | establihed_states += self.state_of_connection_dict.get('S1', 0) 83 | establihed_states += self.state_of_connection_dict.get('S2', 0) 84 | establihed_states += self.state_of_connection_dict.get('S3', 0) 85 | establihed_states += self.state_of_connection_dict.get('RSTO', 0) # delete this 86 | establihed_states += self.state_of_connection_dict.get('RSTR', 0) # delete this 87 | return (establihed_states / float(total_value_states)) 88 | return -1 89 | 90 | """ 91 | These functions are not used. 
92 | """ 93 | # 09 - return 4 items 94 | # def get_based_states_ratio(self): 95 | # SF_S1 = self.state_of_connection_dict['SF'] + self.state_of_connection_dict['S1'] 96 | # S0 = self.state_of_connection_dict['S0'] 97 | # OTH = self.state_of_connection_dict['OTH'] 98 | # REJ = self.state_of_connection_dict['REJ'] 99 | # biggest = max(SF_S1, S0, OTH, REJ) / 100.0 100 | # return SF_S1 / float(biggest), S0 / float(biggest), OTH / float(biggest), REJ / float(biggest) 101 | # 102 | # # 10 - return 6 items 103 | # def get_extended_states_ratio(self): 104 | # SF_S1 = self.state_of_connection_dict['SF'] + self.state_of_connection_dict['S1'] 105 | # S0 = self.state_of_connection_dict['S0'] 106 | # OTH = self.state_of_connection_dict['OTH'] 107 | # REJ = self.state_of_connection_dict['REJ'] 108 | # RSTO_1 = self.state_of_connection_dict['RSTO'] + self.state_of_connection_dict['RSTR'] + self.state_of_connection_dict['S2'] + self.state_of_connection_dict['S3'] 109 | # RSTO_2 = self.state_of_connection_dict['RSTOS0'] + self.state_of_connection_dict['RSTRH'] + self.state_of_connection_dict['SH'] + self.state_of_connection_dict['SHR'] 110 | # biggest = max(SF_S1, S0, OTH, REJ, RSTO_1, RSTO_2) / 100.0 111 | # return SF_S1 / float(biggest), S0 / float(biggest), OTH / float(biggest), REJ / float(biggest), RSTO_1 / float(biggest), RSTO_2 / float(biggest) 112 | 113 | # 11 inbound packets == resp_pkts (18) 114 | # Number of packets that the responder sent. 115 | def get_inbound_pckts(self): 116 | return self.inbound_packtes 117 | 118 | # 12 outbound packets == orig_pkts (16) 119 | def get_outbound_pckts(self): 120 | return self.outbound_packtes 121 | 122 | # Periodicity 123 | # 13 Average of periodicity 124 | def get_periodicity_average(self): 125 | per_list = self.get_periodicity_list() 126 | sum = 0 127 | for i in range(len(per_list)): 128 | sum += per_list[i] 129 | if len(per_list) != 0: 130 | return sum / float(len(per_list)) 131 | # print "periodicity list is zero. 
Number of flows:", self.get_number_of_flows() 132 | return -1 133 | 134 | # 14 135 | def get_periodicity_standart_deviation(self): 136 | per_list = self.get_periodicity_list() 137 | if len(per_list) != 0 and len(per_list) > 2: 138 | # sum = 0 139 | # for i in range(len(per_list)): 140 | # sum += pow(per_list[i], 2) 141 | # EX2 = sum / float(len(per_list)) 142 | # DX = EX2 - EX * EX 143 | # return pow(DX, 0.5) 144 | return np.std(self.get_periodicity_list()) 145 | return -1 146 | 147 | # ----------------------------------------------------- 148 | # 15 ------ Ratio of not ssl flows and ssl flows ------- 149 | def get_ssl_ratio(self): 150 | self.check_zero_dividing(len(self.ssl_flow_list), "Original size is 0 !!!") 151 | return len(self.not_ssl_flow_list) / float(len(self.ssl_flow_list)) 152 | 153 | # 16 Average Public key lenghts 154 | # certificate feature 155 | def get_average_public_key(self): 156 | total = 0 157 | index = 0 158 | for key in self.certificate_key_length_dict.keys(): 159 | total += self.certificate_key_length_dict[key] * int(key) 160 | index += 1 161 | if index != 0: 162 | return total / float(index) 163 | return -1 164 | 165 | # ------------------------------------------------------ 166 | # 17 Version of ssl ratio 167 | def get_tls_version_ratio(self): 168 | tls = 0 169 | ssl = 0 170 | total = 0 171 | for key in self.version_of_ssl_dict.keys(): 172 | if 'tls' in key.lower(): 173 | tls += self.version_of_ssl_dict[key] 174 | elif 'ssl' in key.lower(): 175 | ssl += self.version_of_ssl_dict[key] 176 | total += self.version_of_ssl_dict[key] 177 | if total != 0: 178 | return tls / float(total) 179 | return -1 180 | 181 | # ---------------------------------------------- 182 | # Certificate validation length 183 | # 18 Average of certificate length 184 | # certificate_valid_length = sum of certificate valid length in days 185 | # certificate_valid_number = number of certificate* 186 | def get_average_of_certificate_length(self): 187 | # 
self.check_zero_dividing(self.certificate_valid_number, "certificate_valid_number is 0 !!!") 188 | if self.certificate_valid_number != 0: 189 | if np.mean(self.temp_list) != self.certificate_valid_length / float(self.certificate_valid_number): 190 | print "Error: numpy mean and mean by hand are not same." 191 | return self.certificate_valid_length / float(self.certificate_valid_number) 192 | return -1 193 | 194 | # 19 195 | def get_standart_deviation_cert_length(self): 196 | # self.check_zero_dividing(self.certificate_valid_number, "certificate_valid_number is 0 !!!") 197 | if self.certificate_valid_number != 0: 198 | EX = self.certificate_valid_length / self.certificate_valid_number 199 | EX2 = self.certificate_valid_length_pow / self.certificate_valid_number 200 | DX = EX2 - (EX * EX) 201 | # if DX < 0: 202 | # print "EX:", (EX*EX) 203 | # print "EX2:", EX2 204 | # print "DX:", DX 205 | # print self.temp_list 206 | # print "std:", numpy.std(self.temp_list) 207 | # print len(self.x509_list) 208 | return pow(DX, 0.5) 209 | return -1 210 | 211 | # --------------------------------------------- 212 | # 20 Validity of the certificate during the capture 213 | # certificate feature 214 | # 0 == no certficate was out of validity range 215 | def is_valid_certificate_during_capture(self): 216 | if len(self.cert_percent_validity) != 0: 217 | return self.not_valid_certificate_number 218 | return -1 219 | 220 | # 21 Amount of different certificates 221 | # certificate feature 222 | def get_amount_diff_certificates(self): 223 | return len(self.certificate_serial_dict.keys()) 224 | 225 | # ------------------------------------------------------- 226 | # 22 Number of domains in certificate 227 | # certificate feature 228 | def get_number_of_domains_in_certificate(self): 229 | if self.number_san_domains_index != 0: 230 | return self.number_san_domains / float(self.number_san_domains_index) 231 | return -1 232 | 233 | # 23 Certificate ratio 234 | # certificate feature 235 | # List 
of length of certificate validity length. 236 | def get_certificate_ratio(self): 237 | if len(self.cert_percent_validity) != 0: 238 | temp = 0 239 | for value in self.cert_percent_validity: 240 | temp += value 241 | return temp / float(len(self.cert_percent_validity)) 242 | else: 243 | return -1 244 | 245 | # 24 Certificate path 246 | # number of signed certificate in our first certificate 247 | # It is EX (vazeny prumer) 248 | def get_number_of_certificate_path(self): 249 | up = 0 250 | down = 0 251 | for key in self.certificate_path.keys(): 252 | up += int(key) * self.certificate_path[key] 253 | down += self.certificate_path[key] 254 | if down != 0: 255 | return up/float(down) 256 | return -1 257 | 258 | # 25 x509/ssl ratio 259 | # ratio about how many ssl log has x509 information in this connection 260 | def x509_ssl_ratio(self): 261 | if len(self.ssl_logs_list) == 0: 262 | return -1 263 | return len(self.x509_list) / float(len(self.ssl_logs_list)) 264 | 265 | # 26 SNI and SSL ratio 266 | # ratio, how many ssl flows have SNI (server name) 267 | def SNI_ssl_ratio(self): 268 | return self.ssl_with_SNI / float(len(self.ssl_logs_list)) 269 | 270 | # 27 Self_signed cert and all cert ratio 271 | def self_signed_ratio(self): 272 | # number_of_certificate = len(self.certificate_serial_dict.keys()) 273 | if len(self.ssl_logs_list) != 0: 274 | return self.self_signed_cert / float(len(self.ssl_logs_list)) 275 | return -1 276 | 277 | # 28 Is there any SNI, which not in san.dns ? 278 | def is_SNIs_in_SNA_dns(self): 279 | if len(self.is_SNI_in_san_dns) != 0: 280 | for a in self.is_SNI_in_san_dns: 281 | if a == 0: 282 | return 0 283 | return 1 284 | return -1 285 | 286 | # 29 if SNI is IP, so dst is same ip? 287 | def get_SNI_equal_DstIP(self): 288 | return self.SNI_equal_DstIP 289 | 290 | # 30 Is there any CN, which not in san.dns ? 
291 | def is_CNs_in_SNA_dns(self): 292 | if len(self.is_CN_in_SAN_list) != 0: 293 | for a in self.is_CN_in_SAN_list: 294 | if a == 0: 295 | return 0 296 | return 1 297 | return -1 298 | 299 | 300 | """ 301 | ----------------- New Features ------------------ 302 | """ 303 | # 31 How many ssl lines has different SNI ? 304 | def ratio_of_differ_SNI_in_ssl_log(self): 305 | # Delete stars. 306 | for i in range(0, len(self.SNI_list)): 307 | if '*' in self.SNI_list[i]: 308 | self.SNI_list[i] = self.SNI_list[i].replace('*', '') 309 | 310 | return compute_differents_in_lines(self.SNI_list) 311 | 312 | # 32 How many ssl lines has different subject 313 | def ratio_of_differ_subject_in_ssl_log(self): 314 | return compute_differents_in_lines(self.subject_ssl_list) 315 | 316 | # 33 How many ssl lines has differ issuer 317 | def ratio_of_differ_issuer_in_ssl_log(self): 318 | return compute_differents_in_lines(self.issuer_ssl_list) 319 | 320 | # 34 How many cert has differ subject 321 | def ratio_of_differ_subject_in_cert(self): 322 | return compute_differents_in_lines(self.subject_x509_list) 323 | 324 | # 35 How many cert has differ issuer 325 | def ratio_of_differ_issuer_in_cert(self): 326 | return compute_differents_in_lines(self.issuer_x509_list) 327 | 328 | # 36 How many cert has differ san dns 329 | def ratio_of_differ_sandns_in_cert(self): 330 | return compute_differents_in_lines(self.san_x509_list) 331 | 332 | # 37 Do ssl and x509 lines have same subjects? 333 | def ratio_of_same_subjects(self): 334 | if len(self.x509_list) == 0: 335 | return -1 336 | return self.subject_diff / float(len(self.x509_list)) 337 | 338 | # 38 Do ssl and x509 lines have same issuer? 339 | def ratio_of_same_issuer(self): 340 | if len(self.x509_list) == 0: 341 | return -1 342 | return self.issuer_diff / float(len(self.x509_list)) 343 | 344 | # 39 Is SNI and CN same? 
# 39 (cont.) Ratio of ssl lines whose SNI appears in the certificate CN.
def ratio_is_same_CN_and_SNI(self):
    if len(self.x509_list) == 0:
        return -1
    return self.SNI_is_in_CN / float(len(self.x509_list))

# 40 Certificate exponent average
def average_certificate_exponent(self):
    number_of_certs = len(self.certificate_serial_dict.keys())
    if number_of_certs == 0:
        return -1
    return self.certificate_exponent / float(number_of_certs)

# 41 Is server name in top-level-domain?
def is_SNI_in_top_level_domain(self):
    if self.ssl_with_SNI == 0:
        return -1
    return self.top_level_domain_error / float(self.ssl_with_SNI)

# 42 Is certificate path right? (issuer of first certificate is subject in second cert...)
# BUGFIX: the original condition `if len(self.ssl_logs_list):` was inverted --
# it returned -1 whenever ssl logs WERE present and raised ZeroDivisionError
# when the list was empty. Return -1 only for the empty case.
def ratio_certificate_path_error(self):
    if len(self.ssl_logs_list) == 0:
        return -1
    return self.certificate_path_error / float(len(self.ssl_logs_list))

# 43 Missing certificate in certificate path.
# BUGFIX: same inverted emptiness check as feature 42.
def ratio_missing_cert_in_cert_path(self):
    if len(self.ssl_logs_list) == 0:
        return -1
    return self.missing_cert_in_cert_path / float(len(self.ssl_logs_list))


"""
------- Computation method ---------
"""
def compute_differents_in_lines(array):
    # Ratio of distinct values in `array`: -1.0 for no data, 0.0 when all
    # values are equal, otherwise distinct/total.
    # (Bare `except:` counting replaced with dict.get.)
    if len(array) == 0:
        return -1.0
    counts = dict()
    for item in array:
        counts[item] = counts.get(item, 0) + 1
    if len(counts) == 1:
        return 0.0
    return len(counts) / float(len(array))


# --------------------------------------------------------------------------
# /features_extraction/DNSConnection.py
# --------------------------------------------------------------------------

"""
This class stores all information for DNS records that have the same Domain
name => called one DNSConnection.
"""


class DNSConnection(object):

    def __init__(self, FQDN):
        self.FQDN = FQDN
        self.subdomains = self.FQDN.split('.')
        # Remaining attributes (domain_name, dns_records, ttls, answers)
        # are initialized below in the original file.
# (continuation of DNSConnection.__init__)
self.domain_name = '.'.join(self.subdomains[-2:])  # registered (2nd-level) domain
self.dns_records = list()   # raw bro dns records for this FQDN
self.ttls = list()          # every TTL value observed
self.answers = set()        # distinct IPv4 answers

# --------------------------------------------------------------------------
# /features_extraction/DNSFeatures.py
# --------------------------------------------------------------------------

from DNSConnection import DNSConnection
import string
import csv
from collections import OrderedDict
import config as c
import numpy as np


class DNSFeatures(DNSConnection):
    # Class-level Alexa rankings shared by every instance; populated once
    # via load_all_top_alexa().
    alexa_top100 = list()
    alexa_top1k = list()
    alexa_top10k = list()
    alexa_top100k = list()
    alexa_top1m = list()

    def __init__(self, index):
        super(DNSFeatures, self).__init__(index)

    @staticmethod
    def get_alexa(filename):
        # Each alexa_*.csv stores the whole ranking on a single
        # space-delimited row; return that row as a list of domains.
        with open(filename, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL)
            # builtin next() instead of the Py2-only .next() method
            return next(csvreader)

    @staticmethod
    def load_all_top_alexa():
        DNSFeatures.alexa_top100 = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top100.csv")
        DNSFeatures.alexa_top1k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top1k.csv")
        DNSFeatures.alexa_top10k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top10k.csv")
        DNSFeatures.alexa_top100k = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top100k.csv")
        DNSFeatures.alexa_top1m = DNSFeatures.get_alexa(c.alexa_folder + "alexa_top1m.csv")

    def add_dns_record(self, dns_record):
        # Store the raw record and fold it into the aggregated features.
        self.dns_records.append(dns_record)
        self.compute_classic_features(dns_record)

    def compute_classic_features(self, dns_record):
        # '-' is Bro/Zeek's placeholder for an unset field.
        if dns_record["answers"] != '-':
            self.answers.update(filter(is_ipv4, dns_record["answers"].split(',')))
        if dns_record["TTLs"] != '-':
            self.ttls += map(float, dns_record["TTLs"].split(','))

    ############
    # Features #
    ############

    ############ Anderson
# -----------------------------------------------
# 00. ---------- FQDN Length --------------------
def get_FQDN_length(self):
    """Length of the full domain name, dots included."""
    return len(self.FQDN)

# -----------------------------------------------
# 00. ---------- Domain name Length -------------
def get_domain_name_length(self):
    """Length of the registered (second-level) domain name."""
    return len(self.domain_name)

# ------------------------------------------------------------------
# 00. ---------- number of numerical characters --------------------
def get_number_of_numerical_chars(self):
    # sum(...) is equivalent to the original len(filter(...)) and also
    # works on Python 3, where len(filter(...)) raises TypeError.
    return sum(1 for ch in self.FQDN if ch in string.digits)

# ------------------------------------------------------------------
# 00. ---------- number of non-alphanumeric characters -------------
def get_number_of_non_alphanumeric_chars(self):
    """Count of characters that are neither letters, digits nor dots."""
    alphanumeric = string.ascii_letters + string.digits
    return sum(1 for ch in self.FQDN if ch not in alphanumeric and ch != '.')

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- alexa features --------------------
def compute_alexa_features(self):
    """One-hot membership of the registered domain in the Alexa tiers.

    Exactly one returned key is 1: the first (smallest) Alexa list that
    contains the domain, or 'not_in_alexa' when none does.
    """
    tier_lists = [
        ("in_alexa_top100", DNSFeatures.alexa_top100),
        ("in_alexa_top1k", DNSFeatures.alexa_top1k),
        ("in_alexa_top10k", DNSFeatures.alexa_top10k),
        ("in_alexa_top100k", DNSFeatures.alexa_top100k),
        ("in_alexa_top1m", DNSFeatures.alexa_top1m),
    ]
    alexa_features = OrderedDict()
    for name, _ in tier_lists:
        alexa_features[name] = 0
    alexa_features["not_in_alexa"] = 0

    for name, ranking in tier_lists:
        if binarySearch(ranking, self.domain_name):
            alexa_features[name] = 1
            break
    else:
        alexa_features["not_in_alexa"] = 1
    return alexa_features

######### Mine

# ------------------------------------------------------------------
# 00. ---------- number of unique IP addresses in response ---------
def get_number_unique_IP_addresses_in_response(self):
    """How many distinct IPv4 answers were collected for this domain."""
    return len(self.answers)

# ------------------------------------------------------------------
# 00. ---------- number of subdomains ------------------------------
def get_number_of_subdomains(self):
    """Number of dot-separated labels in the FQDN (TLD included)."""
    return len(self.FQDN.split('.'))

# ------------------------------------------------------------------
# 00. ---------- average TTLs --------------------------------------
def get_average_ttls(self):
    """Mean of all observed TTL values, or -1 when none were seen."""
    if not self.ttls:
        return -1
    return sum(self.ttls) / len(self.ttls)

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- std TTLs --------------------
def get_std_ttls(self):
    """Standard deviation of observed TTLs; -1 with fewer than 3 samples.

    NOTE(review): the `> 2` threshold is kept from the original -- std of
    exactly two samples is also well defined; confirm before changing.
    """
    if len(self.ttls) > 2:
        return np.std(self.ttls)
    return -1

# ------------------------------------------------------------------
# 00. ---------- min TTLs ------------------------------------------
def get_min_ttls(self):
    """Smallest observed TTL, or -1 when none were seen."""
    return min(self.ttls) if len(self.ttls) > 0 else -1

# ------------------------------------------------------------------
# 00. ---------- max TTLs ------------------------------------------
def get_max_ttls(self):
    """Largest observed TTL, or -1 when none were seen."""
    return max(self.ttls) if len(self.ttls) > 0 else -1

# ------------------------------------------------------------------
# 00. ---------- number of hyphens in fqdn -------------------------
def get_number_of_hyphens_in_fqdn(self):
    # str.count replaces the Py2-only len(filter(...)) construct.
    return self.FQDN.count("-")

# ------------------------------------------------------------------
# 00. ---------- length of the longest subdomain name --------------
def get_length_of_longest_subdomain_name(self):
    return max(len(label) for label in self.FQDN.split('.'))

# ------------------------------------------------------------------
# 00. ---------- number of voyels ----------------------------------
def get_number_of_voyels_in_fqdn(self):
    voyels = "aeioue"
    # sum(...) replaces the Py2-only len(filter(...)) construct.
    return sum(1 for ch in self.FQDN if ch in voyels)

# ------------------------------------------------------------------
# 00. ---------- number of different chars in fqdn -----------------
def get_number_of_different_chars_in_fqdn(self):
    # Dots are separators, not part of the name.
    return len(set(self.FQDN) - set('.'))

# ------------------------------------------------------------------
# 00.
# (cont.) ---------- number of consonants --------------------
def get_number_of_consonants_in_fqdn(self):
    """Count of ASCII consonants anywhere in the FQDN."""
    consonants = "zrtypqsdfghjklmwxcvbn"
    # sum(...) replaces the Py2-only len(filter(...)) construct.
    return sum(1 for ch in self.FQDN if ch in consonants)

# ------------------------------------------------------------------
# 00. ---------- shannon entropy on 2ld ----------------------------
def get_shannon_entropy_2ld(self):
    """Shannon entropy of the second-level domain label."""
    try:
        return entropy(self.subdomains[-2])
    except IndexError:
        # Debug output before re-raising; parenthesized so it also runs on
        # Python 3 (the original used Py2-only `print x` statements).
        print(self.FQDN)
        print(self.subdomains)
        raise

# ------------------------------------------------------------------
# 00. ---------- shannon entropy on 3ld ----------------------------
def get_shannon_entropy_3ld(self):
    """Shannon entropy of the third-level label, or -1 if there is none."""
    if len(self.subdomains) > 2:
        return entropy(self.subdomains[-3])
    return -1


# UTILITIES

def binarySearch(alist, item):
    """Standard binary search; `alist` must be sorted ascending.

    Returns True when `item` is found. (The original kept dead `pos` and
    `found` bookkeeping variables, removed here.)
    """
    first = 0
    last = len(alist) - 1
    while first <= last:
        midpoint = (first + last) // 2
        if alist[midpoint] == item:
            return True
        if item < alist[midpoint]:
            last = midpoint - 1
        else:
            first = midpoint + 1
    return False


def entropy(s):
    """Calculate the Shannon entropy (in bits) of a string.

    (Parameter renamed from `str`, which shadowed the builtin.)
    """
    import math

    # Probability of each distinct character in the string.
    prob = [float(s.count(ch)) / len(s) for ch in dict.fromkeys(list(s))]
    return -sum(p * math.log(p) / math.log(2.0) for p in prob)


def is_ipv4(s):
    """True when `s` is a dotted-quad IPv4 address with octets in 0..255."""
    parts = s.split('.')
    if len(parts) != 4:
        return False
    try:
        octets = [int(x) for x in parts]
    except ValueError:
        return False
    # all() replaces the Py2-only len(filter(...)) == 4 test.
    return all(0 <= x <= 255 for x in octets)


# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# /features_extraction/DatasetInformation.py
# --------------------------------------------------------------------------


class DatasetInformation:
    """Bookkeeping counters for one dataset: how many ssl lines were seen
    and how many of them could (not) be paired with an x509 line."""

    def __init__(self, ssl_lines, not_founded_x509_lines, err_not_added_x509, founded_x509_lines):
        self.ssl_lines = ssl_lines
        self.not_founded_x509_lines = not_founded_x509_lines
        self.founded_x509_lines = founded_x509_lines
        self.err_not_added_x509 = err_not_added_x509


# --------------------------------------------------------------------------
# /features_extraction/ExtractFeatures.py
# --------------------------------------------------------------------------

import os
from ConnectionFeatures import ConnectionFeatures
from DatasetInformation import DatasetInformation
from CertificateFeatures import CertificateFeatures
from DNSFeatures import DNSFeatures

from logger import get_logger
logger = get_logger('debug')


class ExtractFeatures(object):
    """Walks one dataset's bro logs (conn/x509/ssl/dns) and aggregates them
    into per-4-tuple connections, certificates and DNS objects."""

    def __init__(self):
        # (src_ip, dst_ip, dst_port, protocol) -> ConnectionFeatures
        self.connection_4_tuples = dict()

        self.x509_dict = dict()               # x509 uid -> x509 log line
        self.control_ssl_uids_dict = dict()   # ssl uid -> ssl line (duplicate guard)

        self.number_conn_lines = 0
        self.conn_dict = dict()               # conn uid -> labelled conn line

        # Error counters.
        self.err_conn_uids = 0
        self.err_more_same_X509 = 0
        self.err_not_added_x509 = 0

        # Per-dataset ssl/x509 pairing statistics (reset between datasets).
        self.ssl_lines = 0
        self.not_founded_x509_lines = 0
        self.founded_x509_lines = 0

        self.certificate_dict = dict()        # cert serial -> CertificateFeatures

        self.dataset_information_dict = dict()

        self.dns_lines = 0
        self.dns_connections = dict()
        # keys = IPs, values = dns_connections. Note: multiple keys can refer
        # to the same dns_connection.
        self.dns_connections_index = dict()

    def extraction_manager(self, dataset_path_to_logs):
        """Process every bro log of one dataset, in dependency order."""
        # Loads all conn logs in bro folder.
        self.conn_logs(dataset_path_to_logs)
        # Loads all x509 logs in bro folder.
        self.x509_logs(dataset_path_to_logs)
        # Load all ssl logs.
        self.ssl_logs(dataset_path_to_logs)
        # Find not-ssl lines in conn logs that belong to created conn 4-tuples.
        self.conn_logs_2(dataset_path_to_logs)
        # Load all dns logs.
        self.dns_logs(dataset_path_to_logs)

        logger.info("SSL Lines: {}".format(self.ssl_lines))
        logger.info("Not founded x509 lines: {}".format(self.not_founded_x509_lines))
        logger.info("Not '-' x509 lines: {}".format(self.err_not_added_x509))
        logger.info("Founded x509 lines: {}".format(self.founded_x509_lines))

        dataset_info = DatasetInformation(self.ssl_lines, self.not_founded_x509_lines,
                                          self.err_not_added_x509, self.founded_x509_lines)
        self.dataset_information_dict[dataset_path_to_logs] = dataset_info

        # Reset the per-dataset counters for the next dataset.
        self.ssl_lines = 0
        self.not_founded_x509_lines = 0
        self.founded_x509_lines = 0
        self.err_not_added_x509 = 0

    """
    ---------------------- Conn logs. -------------------------
    """
    def conn_logs(self, dataset_path_to_logs):
        logger.info("loading conn logs...")
        print(" << Read all conn logs:")
        print("Reading conn logs:")
        self.number_conn_lines = 0
        all_conn_logs = get_such_logs(dataset_path_to_logs, ['conn', '_label'])
        for conn_log in all_conn_logs:
            self.read_conn_log(dataset_path_to_logs + conn_log)
        print(" << Loaded conn logs: {}".format(len(all_conn_logs)))

    def read_conn_log(self, dataset_path_to_conn):
        """Cache every labelled (non-Background) conn line by its uid."""
        try:
            with open(dataset_path_to_conn) as f:
                for line in f:
                    if line[0] == '#':
                        continue
                    split_conn_line = line.split('\t')
                    conn_uid = split_conn_line[1]

                    # Lines without a label column are useless here.
                    if len(split_conn_line) < 22:
                        continue

                    label = split_conn_line[21]
                    if 'Background' in label or 'No_Label' in label:
                        continue

                    # Keep the first occurrence of each uid; warn about dups.
                    if conn_uid in self.conn_dict:
                        print("Error: more same conn line !")
                    else:
                        self.conn_dict[conn_uid] = line

                    # Bro appends a "#close" footer; nothing useful follows.
                    if "#close" in line:
                        break
        except IOError:
            logger.error("Error: The conn file: {} does not exist.".format(dataset_path_to_conn))

    """
    --------------------- X509 logs. ------------------------
    """
    def x509_logs(self, dataset_path_to_logs):
        logger.info("loading x509 logs...")
        print("<< Read all x509 logs:")
        # Clear leftovers from the previous dataset.
        self.x509_dict = dict()
        all_x509_logs = get_such_logs(dataset_path_to_logs, ['x509'])
        print("num x509 logs: {}".format(len(all_x509_logs)))
        for x509_log in all_x509_logs:
            self.read_x509_log(dataset_path_to_logs, x509_log)
        print(" << Loaded x509 logs: {}".format(len(all_x509_logs)))

    def read_x509_log(self, dataset_path_to_logs, x509_log):
        """
        Read start_date.txt where the capture start time of this dataset is
        stored. Some datasets start at 1.1.1970 00:00:00, so that offset must
        be added to every timestamp. When the file does not exist the dataset
        already has correct times.
        """
        # 'start_date.txt' lives in the dataset folder, not in the bro folder.
        sub_folder = os.path.dirname(dataset_path_to_logs)
        started_unix_time = 0.0
        try:
            with open(sub_folder + "/start_date.txt") as f:
                started_unix_time = float(f.readlines()[1])
                print(" << Started unix time file was read in: {}".format(sub_folder))
        except IOError:
            # It means that this dataset has the right time format.
            pass

        try:
            with open(dataset_path_to_logs + x509_log) as f:
                for line in f:
                    if '#' == line[0]:
                        continue
                    # NOTE(review): fields are split on ' ' here but on '\t'
                    # in read_conn_log -- confirm the real field separator.
                    x509_split = line.split(' ')

                    # Shift the timestamp for datasets captured "since 1970".
                    time_new = float(x509_split[0]) + started_unix_time
                    new_line = ' '.join([str(time_new)] + x509_split[1:])

                    x509_uid = x509_split[1]
                    if x509_uid in self.x509_dict:
                        self.err_more_same_X509 += 1
                    else:
                        self.x509_dict[x509_uid] = new_line
        except IOError:
            logger.error("Error: The x509 file: " + dataset_path_to_logs + x509_log + " does not exist.")

    """
    --------------------- SSL logs. ------------------------
    """
    def ssl_logs(self, dataset_path_to_logs):
        print("<< Read all ssl logs::")
        self.control_ssl_uids_dict = dict()
        all_ssl_logs = get_such_logs(dataset_path_to_logs, ['ssl'])
        for ssl_log in all_ssl_logs:
            self.create_4_tuples(dataset_path_to_logs + ssl_log)
        print(" << Loaded ssl logs: {}".format(len(all_ssl_logs)))

    def create_4_tuples(self, path_to_ssl_log):
        """Group every labelled ssl flow by its (src, dst, dport, proto)."""
        with open(path_to_ssl_log) as ssl_file:
            for ssl_line in ssl_file:
                if '#' == ssl_line[0]:
                    continue

                ssl_split = ssl_line.split(' ')
                ssl_uid = ssl_split[1]

                # Some ssl.log files contain duplicated lines (probably a bro
                # error) -- process each uid only once.
                if ssl_uid in self.control_ssl_uids_dict:
                    stored_line = self.control_ssl_uids_dict[ssl_uid]
                    if ssl_line != stored_line:
                        old_ssl_split = stored_line.split(' ')
                        new_ssl_split = ssl_line.split(' ')
                        # Only the first 21 columns are expected to match.
                        for i in range(0, len(old_ssl_split)):
                            if i <= 20 and old_ssl_split[i] != new_ssl_split[i]:
                                # BUGFIX: was logger.erro(...), an AttributeError.
                                logger.error("SSL Error - ssl lines with same uid are not same! Path: {} SSL uid: {}".format(path_to_ssl_log, ssl_uid))
                    continue
                self.control_ssl_uids_dict[ssl_uid] = ssl_line

                # Find the flow in conn.log by this ssl uid. conn_dict only
                # contains normal/malware lines, so ssl lines pointing at
                # background conn lines miss here.
                conn_log = self.conn_dict.get(ssl_uid)
                if conn_log is None:
                    continue

                conn_split = conn_log.split(' ')
                # 2-srcIpAddress, 4-dstIpAddress, 5-dstPort, 6-Protocol
                connection_index = conn_split[2], conn_split[4], conn_split[5], conn_split[6]

                try:
                    label = conn_split[21]
                except IndexError:
                    logger.error("Error: no label in conn line. conn index: {}".format(connection_index))
                    # BUGFIX: the original fell through with `label` unbound
                    # and crashed (NameError) on the next statement.
                    continue

                if 'Background' in label or 'No_Label' in label:
                    logger.error("Error: Backgroung label. conn index: {}".format(connection_index))
                    continue

                if 'Botnet' not in label and 'Normal' not in label:
                    logger.error("Error: Dear more, there are more states of labels !!!! conn index: {}".format(connection_index))

                if connection_index not in self.connection_4_tuples:
                    self.connection_4_tuples[connection_index] = ConnectionFeatures(connection_index)
                self.connection_4_tuples[connection_index].add_ssl_flow(conn_log, label)

                self.ssl_lines += 1
                # Pair the ssl line with its x509 line(s).
                valid_x509_list = self.split_ssl(ssl_line, connection_index, label)

                self.connection_4_tuples[connection_index].add_ssl_log(ssl_line, valid_x509_list,
                                                                       os.path.basename(path_to_ssl_log))

                # For checking the certificate path, collect every x509 log of
                # the chain (column 14 holds a comma-separated uid list).
                list_of_x509_uids = ssl_split[14].split(',')
                x509_lines_arr = []
                is_founded = True
                for x509_uid in list_of_x509_uids:
                    if x509_uid in self.x509_dict:
                        x509_lines_arr.append(self.x509_dict[x509_uid])
                    else:
                        # Keep scanning: the original notes `break` here
                        # caused an error.
                        is_founded = False
                self.connection_4_tuples[connection_index].check_certificate_path(x509_lines_arr, is_founded)

    '''
    Methods for adding not-ssl flows from conn.log to the connection 4-tuples.
    '''

    def conn_logs_2(self, dataset_path_to_logs):
        print(" << Read all conn logs again:")
        all_conn_logs = get_such_logs(dataset_path_to_logs, ['conn', '_label'])
        for conn_log in all_conn_logs:
            self.add_not_ssl_logs(dataset_path_to_logs + conn_log)
        print(" << Loaded conn logs 2: {}".format(len(all_conn_logs)))

    def add_not_ssl_logs(self, path_to_conn):
        print(" <<< adding not ssl flow:")
        with open(path_to_conn) as f:
            for line in f:
                if '#' == line[0]:
                    continue
                conn_split = line.split(' ')
                # 2-srcIpAddress, 4-dstIpAddress, 5-dstPort, 6-Protocol
                if len(conn_split) < 7:
                    continue

                connection_index = conn_split[2], conn_split[4], conn_split[5], conn_split[6]
                try:
                    label = conn_split[21]
                except IndexError:
                    # NOTE(review): "False" is a sentinel string, not a bool.
                    label = "False"
                conn_uid = conn_split[1]

                if 'Background' in label or 'No_Label' in label:
                    continue

                # Only 4-tuples that already exist (i.e. had ssl traffic) get
                # their non-ssl flows attached; anything else is skipped.
                connection = self.connection_4_tuples.get(connection_index)
                if connection is not None and conn_uid not in connection.get_uid_flow_dict():
                    connection.add_not_ssl_flow(line, label)

    """
    ---------------------- DNS logs. -------------------------
    """

    def dns_logs(self, dataset_path_to_logs):
        logger.info("loading dns logs...")
        print(" << Read all dns logs:")
        print("Reading dns logs:")
        self.dns_lines = 0
        all_dns_logs = get_such_logs(dataset_path_to_logs, ['dns'])
        for dns_log in all_dns_logs:
            self.read_dns_log(dataset_path_to_logs + dns_log)
        print(" << Loaded dns logs: {}".format(len(all_dns_logs)))

    def read_dns_log(self, dataset_path_to_dns):
        """Aggregate every A/AAAA query with a plausible domain name."""
        headers = None
        try:
            with open(dataset_path_to_dns) as f:
                for line in f:
                    split_dns_line = line.split('\t')
                    if split_dns_line[0] == "#fields":
                        headers = split_dns_line[1:]
                        continue
                    elif line[0] == '#':
                        continue
                    if headers is None:
                        # ROBUSTNESS: a record before the "#fields" header
                        # would have crashed on an unbound `headers`.
                        continue

                    dns_record = dict(zip(headers, split_dns_line))

                    unknown_domain_names = ["(empty)", "immutableset"]
                    if (dns_record['qtype_name'] == 'A' or dns_record['qtype_name'] == 'AAAA') and \
                            dns_record['query'] not in unknown_domain_names and '.' in dns_record['query']:
                        dns_index = dns_record['query']
                        # One DNSFeatures object per queried name; re-index it
                        # by every resolved IP after each new record.
                        if dns_index not in self.dns_connections:
                            self.dns_connections[dns_index] = DNSFeatures(dns_index)
                        self.dns_connections[dns_index].add_dns_record(dns_record)
                        for ip in self.dns_connections[dns_index].answers:
                            self.dns_connections_index[ip] = self.dns_connections[dns_index]

                        self.dns_lines += 1
        except IOError:
            logger.error("Error: The dns file: {} does not exist.".format(dataset_path_to_dns))

    """
    ------------------------------------------------
    --------------- Methods ------------------------
    ------------------------------------------------
    """

    '''
    Just a checking function: every x509 uid from the ssl log should be found
    in the x509 file.
    '''
    def split_ssl(self, ssl_line, tuple_index, label):
        split = ssl_line.split(' ')
        # '-' / '(object)' mean no certificate information in this ssl line.
        if '-' == split[14] or '(object)' == split[14]:
            self.err_not_added_x509 += 1
            return []
        self.put_server_name_to_dict(split[1], split[9], tuple_index, split[14], label)
        return self.get_x509_lines(split[14].split(','))

    '''
    Return the x509 line referenced by the FIRST uid of the ssl line's uid list.
    '''
    def get_x509_lines(self, x509_uids_list):
        uid_x509 = x509_uids_list[0]
        if uid_x509 not in self.x509_dict:
            self.not_founded_x509_lines += 1
            return []
        self.founded_x509_lines += 1
        return [self.x509_dict[uid_x509]]

    # certificate dict
    def put_server_name_to_dict(self, ssl_uid, server_name, tuple_index, x509_uids_list, label):
        """Register `server_name` under the certificate serial of the first
        x509 uid in `x509_uids_list` (a comma-separated uid string)."""
        uid_x509 = x509_uids_list.split(',')[0]
        if uid_x509 not in self.x509_dict:
            logger.error("Error: [put_server_name] In ProcessLogs.py x509 does not have this x509uid: {}".format(uid_x509))
            return
        x509_line = self.x509_dict[uid_x509]
        cert_serial = x509_line.split(' ')[3]
        if cert_serial not in self.certificate_dict:
            self.certificate_dict[cert_serial] = CertificateFeatures(cert_serial, x509_line)
        self.certificate_dict[cert_serial].add_server_name(server_name, label)
        self.certificate_dict[cert_serial].add_x509_line(x509_line)


def get_such_logs(path_to_logs, part_name_list):
    """Filenames in `path_to_logs` whose name contains every fragment of
    `part_name_list`."""
    return [name for name in os.listdir(path_to_logs)
            if all(part in name for part in part_name_list)]


# --------------------------------------------------------------------------
# /features_extraction/MainBro.py
# --------------------------------------------------------------------------

import os
import sys
sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/')

from time import time
# (continuation of MainBro.py imports; `from time import time` precedes)
from time import time
import datetime
from ComputeFeatures import ComputeFeatures
import config as c
import os
from DNSFeatures import DNSFeatures
from logger import get_logger

logger = get_logger('debug')


def main():
    """Run feature extraction over every dataset sub-folder."""
    # Start to count the time.
    start_time = time()

    # Create new instance.
    extract_features = ComputeFeatures()

    print(" << Loading top alexa: ")
    DNSFeatures.load_all_top_alexa()
    print(" << Loaded top alexa: ")

    # Go through every subset in the dataset folder (hidden entries skipped).
    # enumerate() replaces the original hand-maintained `index` counter.
    visible = (d for d in os.listdir(c.datasets_folder) if not d.startswith("."))
    for index, sub_set in enumerate(visible, 1):
        logger.info("--------------------------------------------------------")
        logger.info("-------- #{} {} extraction".format(index, sub_set))
        logger.info("--------------------------------------------------------")

        extract_features.extraction_manager(c.datasets_folder + sub_set + '/bro/')

    # Add certificate to connections that do not contain any certificate.
    extract_features.add_cert_to_non_cert_conn()

    # Compute features and save them.
42 | #extract_features.create_dataset_dns() 43 | logger.info("computing features...") 44 | extract_features.create_balanced_dataset() 45 | 46 | # Print final statistic 47 | extract_features.print_statistic() 48 | # Extract_features.compute_features() 49 | extract_features.save_dataset_information() 50 | 51 | total_time = datetime.timedelta(seconds=time() - start_time) 52 | print "<<< All dataset successfully finished in aproximate time: " + str(total_time) 53 | 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /features_extraction/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/features_extraction/__init.py__.py -------------------------------------------------------------------------------- /features_extraction/top_level_domain: -------------------------------------------------------------------------------- 1 | # Version 2017090700, Last Updated Thu Sep 7 07:07:01 2017 UTC 2 | AAA 3 | AARP 4 | ABARTH 5 | ABB 6 | ABBOTT 7 | ABBVIE 8 | ABC 9 | ABLE 10 | ABOGADO 11 | ABUDHABI 12 | AC 13 | ACADEMY 14 | ACCENTURE 15 | ACCOUNTANT 16 | ACCOUNTANTS 17 | ACO 18 | ACTIVE 19 | ACTOR 20 | AD 21 | ADAC 22 | ADS 23 | ADULT 24 | AE 25 | AEG 26 | AERO 27 | AETNA 28 | AF 29 | AFAMILYCOMPANY 30 | AFL 31 | AFRICA 32 | AG 33 | AGAKHAN 34 | AGENCY 35 | AI 36 | AIG 37 | AIGO 38 | AIRBUS 39 | AIRFORCE 40 | AIRTEL 41 | AKDN 42 | AL 43 | ALFAROMEO 44 | ALIBABA 45 | ALIPAY 46 | ALLFINANZ 47 | ALLSTATE 48 | ALLY 49 | ALSACE 50 | ALSTOM 51 | AM 52 | AMERICANEXPRESS 53 | AMERICANFAMILY 54 | AMEX 55 | AMFAM 56 | AMICA 57 | AMSTERDAM 58 | ANALYTICS 59 | ANDROID 60 | ANQUAN 61 | ANZ 62 | AO 63 | AOL 64 | APARTMENTS 65 | APP 66 | APPLE 67 | AQ 68 | AQUARELLE 69 | AR 70 | ARAB 71 | ARAMCO 72 | ARCHI 73 | ARMY 74 | ARPA 75 | ART 76 | ARTE 77 | AS 78 | ASDA 79 | 
ASIA 80 | ASSOCIATES 81 | AT 82 | ATHLETA 83 | ATTORNEY 84 | AU 85 | AUCTION 86 | AUDI 87 | AUDIBLE 88 | AUDIO 89 | AUSPOST 90 | AUTHOR 91 | AUTO 92 | AUTOS 93 | AVIANCA 94 | AW 95 | AWS 96 | AX 97 | AXA 98 | AZ 99 | AZURE 100 | BA 101 | BABY 102 | BAIDU 103 | BANAMEX 104 | BANANAREPUBLIC 105 | BAND 106 | BANK 107 | BAR 108 | BARCELONA 109 | BARCLAYCARD 110 | BARCLAYS 111 | BAREFOOT 112 | BARGAINS 113 | BASEBALL 114 | BASKETBALL 115 | BAUHAUS 116 | BAYERN 117 | BB 118 | BBC 119 | BBT 120 | BBVA 121 | BCG 122 | BCN 123 | BD 124 | BE 125 | BEATS 126 | BEAUTY 127 | BEER 128 | BENTLEY 129 | BERLIN 130 | BEST 131 | BESTBUY 132 | BET 133 | BF 134 | BG 135 | BH 136 | BHARTI 137 | BI 138 | BIBLE 139 | BID 140 | BIKE 141 | BING 142 | BINGO 143 | BIO 144 | BIZ 145 | BJ 146 | BLACK 147 | BLACKFRIDAY 148 | BLANCO 149 | BLOCKBUSTER 150 | BLOG 151 | BLOOMBERG 152 | BLUE 153 | BM 154 | BMS 155 | BMW 156 | BN 157 | BNL 158 | BNPPARIBAS 159 | BO 160 | BOATS 161 | BOEHRINGER 162 | BOFA 163 | BOM 164 | BOND 165 | BOO 166 | BOOK 167 | BOOKING 168 | BOOTS 169 | BOSCH 170 | BOSTIK 171 | BOSTON 172 | BOT 173 | BOUTIQUE 174 | BOX 175 | BR 176 | BRADESCO 177 | BRIDGESTONE 178 | BROADWAY 179 | BROKER 180 | BROTHER 181 | BRUSSELS 182 | BS 183 | BT 184 | BUDAPEST 185 | BUGATTI 186 | BUILD 187 | BUILDERS 188 | BUSINESS 189 | BUY 190 | BUZZ 191 | BV 192 | BW 193 | BY 194 | BZ 195 | BZH 196 | CA 197 | CAB 198 | CAFE 199 | CAL 200 | CALL 201 | CALVINKLEIN 202 | CAM 203 | CAMERA 204 | CAMP 205 | CANCERRESEARCH 206 | CANON 207 | CAPETOWN 208 | CAPITAL 209 | CAPITALONE 210 | CAR 211 | CARAVAN 212 | CARDS 213 | CARE 214 | CAREER 215 | CAREERS 216 | CARS 217 | CARTIER 218 | CASA 219 | CASE 220 | CASEIH 221 | CASH 222 | CASINO 223 | CAT 224 | CATERING 225 | CATHOLIC 226 | CBA 227 | CBN 228 | CBRE 229 | CBS 230 | CC 231 | CD 232 | CEB 233 | CENTER 234 | CEO 235 | CERN 236 | CF 237 | CFA 238 | CFD 239 | CG 240 | CH 241 | CHANEL 242 | CHANNEL 243 | CHASE 244 | CHAT 245 | CHEAP 246 | CHINTAI 247 | CHLOE 
248 | CHRISTMAS 249 | CHROME 250 | CHRYSLER 251 | CHURCH 252 | CI 253 | CIPRIANI 254 | CIRCLE 255 | CISCO 256 | CITADEL 257 | CITI 258 | CITIC 259 | CITY 260 | CITYEATS 261 | CK 262 | CL 263 | CLAIMS 264 | CLEANING 265 | CLICK 266 | CLINIC 267 | CLINIQUE 268 | CLOTHING 269 | CLOUD 270 | CLUB 271 | CLUBMED 272 | CM 273 | CN 274 | CO 275 | COACH 276 | CODES 277 | COFFEE 278 | COLLEGE 279 | COLOGNE 280 | COM 281 | COMCAST 282 | COMMBANK 283 | COMMUNITY 284 | COMPANY 285 | COMPARE 286 | COMPUTER 287 | COMSEC 288 | CONDOS 289 | CONSTRUCTION 290 | CONSULTING 291 | CONTACT 292 | CONTRACTORS 293 | COOKING 294 | COOKINGCHANNEL 295 | COOL 296 | COOP 297 | CORSICA 298 | COUNTRY 299 | COUPON 300 | COUPONS 301 | COURSES 302 | CR 303 | CREDIT 304 | CREDITCARD 305 | CREDITUNION 306 | CRICKET 307 | CROWN 308 | CRS 309 | CRUISE 310 | CRUISES 311 | CSC 312 | CU 313 | CUISINELLA 314 | CV 315 | CW 316 | CX 317 | CY 318 | CYMRU 319 | CYOU 320 | CZ 321 | DABUR 322 | DAD 323 | DANCE 324 | DATA 325 | DATE 326 | DATING 327 | DATSUN 328 | DAY 329 | DCLK 330 | DDS 331 | DE 332 | DEAL 333 | DEALER 334 | DEALS 335 | DEGREE 336 | DELIVERY 337 | DELL 338 | DELOITTE 339 | DELTA 340 | DEMOCRAT 341 | DENTAL 342 | DENTIST 343 | DESI 344 | DESIGN 345 | DEV 346 | DHL 347 | DIAMONDS 348 | DIET 349 | DIGITAL 350 | DIRECT 351 | DIRECTORY 352 | DISCOUNT 353 | DISCOVER 354 | DISH 355 | DIY 356 | DJ 357 | DK 358 | DM 359 | DNP 360 | DO 361 | DOCS 362 | DOCTOR 363 | DODGE 364 | DOG 365 | DOHA 366 | DOMAINS 367 | DOT 368 | DOWNLOAD 369 | DRIVE 370 | DTV 371 | DUBAI 372 | DUCK 373 | DUNLOP 374 | DUNS 375 | DUPONT 376 | DURBAN 377 | DVAG 378 | DVR 379 | DZ 380 | EARTH 381 | EAT 382 | EC 383 | ECO 384 | EDEKA 385 | EDU 386 | EDUCATION 387 | EE 388 | EG 389 | EMAIL 390 | EMERCK 391 | ENERGY 392 | ENGINEER 393 | ENGINEERING 394 | ENTERPRISES 395 | EPOST 396 | EPSON 397 | EQUIPMENT 398 | ER 399 | ERICSSON 400 | ERNI 401 | ES 402 | ESQ 403 | ESTATE 404 | ESURANCE 405 | ET 406 | ETISALAT 407 | EU 408 | EUROVISION 409 
| EUS 410 | EVENTS 411 | EVERBANK 412 | EXCHANGE 413 | EXPERT 414 | EXPOSED 415 | EXPRESS 416 | EXTRASPACE 417 | FAGE 418 | FAIL 419 | FAIRWINDS 420 | FAITH 421 | FAMILY 422 | FAN 423 | FANS 424 | FARM 425 | FARMERS 426 | FASHION 427 | FAST 428 | FEDEX 429 | FEEDBACK 430 | FERRARI 431 | FERRERO 432 | FI 433 | FIAT 434 | FIDELITY 435 | FIDO 436 | FILM 437 | FINAL 438 | FINANCE 439 | FINANCIAL 440 | FIRE 441 | FIRESTONE 442 | FIRMDALE 443 | FISH 444 | FISHING 445 | FIT 446 | FITNESS 447 | FJ 448 | FK 449 | FLICKR 450 | FLIGHTS 451 | FLIR 452 | FLORIST 453 | FLOWERS 454 | FLY 455 | FM 456 | FO 457 | FOO 458 | FOOD 459 | FOODNETWORK 460 | FOOTBALL 461 | FORD 462 | FOREX 463 | FORSALE 464 | FORUM 465 | FOUNDATION 466 | FOX 467 | FR 468 | FREE 469 | FRESENIUS 470 | FRL 471 | FROGANS 472 | FRONTDOOR 473 | FRONTIER 474 | FTR 475 | FUJITSU 476 | FUJIXEROX 477 | FUN 478 | FUND 479 | FURNITURE 480 | FUTBOL 481 | FYI 482 | GA 483 | GAL 484 | GALLERY 485 | GALLO 486 | GALLUP 487 | GAME 488 | GAMES 489 | GAP 490 | GARDEN 491 | GB 492 | GBIZ 493 | GD 494 | GDN 495 | GE 496 | GEA 497 | GENT 498 | GENTING 499 | GEORGE 500 | GF 501 | GG 502 | GGEE 503 | GH 504 | GI 505 | GIFT 506 | GIFTS 507 | GIVES 508 | GIVING 509 | GL 510 | GLADE 511 | GLASS 512 | GLE 513 | GLOBAL 514 | GLOBO 515 | GM 516 | GMAIL 517 | GMBH 518 | GMO 519 | GMX 520 | GN 521 | GODADDY 522 | GOLD 523 | GOLDPOINT 524 | GOLF 525 | GOO 526 | GOODHANDS 527 | GOODYEAR 528 | GOOG 529 | GOOGLE 530 | GOP 531 | GOT 532 | GOV 533 | GP 534 | GQ 535 | GR 536 | GRAINGER 537 | GRAPHICS 538 | GRATIS 539 | GREEN 540 | GRIPE 541 | GROCERY 542 | GROUP 543 | GS 544 | GT 545 | GU 546 | GUARDIAN 547 | GUCCI 548 | GUGE 549 | GUIDE 550 | GUITARS 551 | GURU 552 | GW 553 | GY 554 | HAIR 555 | HAMBURG 556 | HANGOUT 557 | HAUS 558 | HBO 559 | HDFC 560 | HDFCBANK 561 | HEALTH 562 | HEALTHCARE 563 | HELP 564 | HELSINKI 565 | HERE 566 | HERMES 567 | HGTV 568 | HIPHOP 569 | HISAMITSU 570 | HITACHI 571 | HIV 572 | HK 573 | HKT 574 | HM 575 | HN 
576 | HOCKEY 577 | HOLDINGS 578 | HOLIDAY 579 | HOMEDEPOT 580 | HOMEGOODS 581 | HOMES 582 | HOMESENSE 583 | HONDA 584 | HONEYWELL 585 | HORSE 586 | HOSPITAL 587 | HOST 588 | HOSTING 589 | HOT 590 | HOTELES 591 | HOTELS 592 | HOTMAIL 593 | HOUSE 594 | HOW 595 | HR 596 | HSBC 597 | HT 598 | HTC 599 | HU 600 | HUGHES 601 | HYATT 602 | HYUNDAI 603 | IBM 604 | ICBC 605 | ICE 606 | ICU 607 | ID 608 | IE 609 | IEEE 610 | IFM 611 | IKANO 612 | IL 613 | IM 614 | IMAMAT 615 | IMDB 616 | IMMO 617 | IMMOBILIEN 618 | IN 619 | INDUSTRIES 620 | INFINITI 621 | INFO 622 | ING 623 | INK 624 | INSTITUTE 625 | INSURANCE 626 | INSURE 627 | INT 628 | INTEL 629 | INTERNATIONAL 630 | INTUIT 631 | INVESTMENTS 632 | IO 633 | IPIRANGA 634 | IQ 635 | IR 636 | IRISH 637 | IS 638 | ISELECT 639 | ISMAILI 640 | IST 641 | ISTANBUL 642 | IT 643 | ITAU 644 | ITV 645 | IVECO 646 | IWC 647 | JAGUAR 648 | JAVA 649 | JCB 650 | JCP 651 | JE 652 | JEEP 653 | JETZT 654 | JEWELRY 655 | JIO 656 | JLC 657 | JLL 658 | JM 659 | JMP 660 | JNJ 661 | JO 662 | JOBS 663 | JOBURG 664 | JOT 665 | JOY 666 | JP 667 | JPMORGAN 668 | JPRS 669 | JUEGOS 670 | JUNIPER 671 | KAUFEN 672 | KDDI 673 | KE 674 | KERRYHOTELS 675 | KERRYLOGISTICS 676 | KERRYPROPERTIES 677 | KFH 678 | KG 679 | KH 680 | KI 681 | KIA 682 | KIM 683 | KINDER 684 | KINDLE 685 | KITCHEN 686 | KIWI 687 | KM 688 | KN 689 | KOELN 690 | KOMATSU 691 | KOSHER 692 | KP 693 | KPMG 694 | KPN 695 | KR 696 | KRD 697 | KRED 698 | KUOKGROUP 699 | KW 700 | KY 701 | KYOTO 702 | KZ 703 | LA 704 | LACAIXA 705 | LADBROKES 706 | LAMBORGHINI 707 | LAMER 708 | LANCASTER 709 | LANCIA 710 | LANCOME 711 | LAND 712 | LANDROVER 713 | LANXESS 714 | LASALLE 715 | LAT 716 | LATINO 717 | LATROBE 718 | LAW 719 | LAWYER 720 | LB 721 | LC 722 | LDS 723 | LEASE 724 | LECLERC 725 | LEFRAK 726 | LEGAL 727 | LEGO 728 | LEXUS 729 | LGBT 730 | LI 731 | LIAISON 732 | LIDL 733 | LIFE 734 | LIFEINSURANCE 735 | LIFESTYLE 736 | LIGHTING 737 | LIKE 738 | LILLY 739 | LIMITED 740 | LIMO 741 | LINCOLN 
742 | LINDE 743 | LINK 744 | LIPSY 745 | LIVE 746 | LIVING 747 | LIXIL 748 | LK 749 | LOAN 750 | LOANS 751 | LOCKER 752 | LOCUS 753 | LOFT 754 | LOL 755 | LONDON 756 | LOTTE 757 | LOTTO 758 | LOVE 759 | LPL 760 | LPLFINANCIAL 761 | LR 762 | LS 763 | LT 764 | LTD 765 | LTDA 766 | LU 767 | LUNDBECK 768 | LUPIN 769 | LUXE 770 | LUXURY 771 | LV 772 | LY 773 | MA 774 | MACYS 775 | MADRID 776 | MAIF 777 | MAISON 778 | MAKEUP 779 | MAN 780 | MANAGEMENT 781 | MANGO 782 | MAP 783 | MARKET 784 | MARKETING 785 | MARKETS 786 | MARRIOTT 787 | MARSHALLS 788 | MASERATI 789 | MATTEL 790 | MBA 791 | MC 792 | MCKINSEY 793 | MD 794 | ME 795 | MED 796 | MEDIA 797 | MEET 798 | MELBOURNE 799 | MEME 800 | MEMORIAL 801 | MEN 802 | MENU 803 | MEO 804 | MERCKMSD 805 | METLIFE 806 | MG 807 | MH 808 | MIAMI 809 | MICROSOFT 810 | MIL 811 | MINI 812 | MINT 813 | MIT 814 | MITSUBISHI 815 | MK 816 | ML 817 | MLB 818 | MLS 819 | MM 820 | MMA 821 | MN 822 | MO 823 | MOBI 824 | MOBILE 825 | MOBILY 826 | MODA 827 | MOE 828 | MOI 829 | MOM 830 | MONASH 831 | MONEY 832 | MONSTER 833 | MOPAR 834 | MORMON 835 | MORTGAGE 836 | MOSCOW 837 | MOTO 838 | MOTORCYCLES 839 | MOV 840 | MOVIE 841 | MOVISTAR 842 | MP 843 | MQ 844 | MR 845 | MS 846 | MSD 847 | MT 848 | MTN 849 | MTR 850 | MU 851 | MUSEUM 852 | MUTUAL 853 | MV 854 | MW 855 | MX 856 | MY 857 | MZ 858 | NA 859 | NAB 860 | NADEX 861 | NAGOYA 862 | NAME 863 | NATIONWIDE 864 | NATURA 865 | NAVY 866 | NBA 867 | NC 868 | NE 869 | NEC 870 | NET 871 | NETBANK 872 | NETFLIX 873 | NETWORK 874 | NEUSTAR 875 | NEW 876 | NEWHOLLAND 877 | NEWS 878 | NEXT 879 | NEXTDIRECT 880 | NEXUS 881 | NF 882 | NFL 883 | NG 884 | NGO 885 | NHK 886 | NI 887 | NICO 888 | NIKE 889 | NIKON 890 | NINJA 891 | NISSAN 892 | NISSAY 893 | NL 894 | NO 895 | NOKIA 896 | NORTHWESTERNMUTUAL 897 | NORTON 898 | NOW 899 | NOWRUZ 900 | NOWTV 901 | NP 902 | NR 903 | NRA 904 | NRW 905 | NTT 906 | NU 907 | NYC 908 | NZ 909 | OBI 910 | OBSERVER 911 | OFF 912 | OFFICE 913 | OKINAWA 914 | OLAYAN 915 | 
OLAYANGROUP 916 | OLDNAVY 917 | OLLO 918 | OM 919 | OMEGA 920 | ONE 921 | ONG 922 | ONL 923 | ONLINE 924 | ONYOURSIDE 925 | OOO 926 | OPEN 927 | ORACLE 928 | ORANGE 929 | ORG 930 | ORGANIC 931 | ORIGINS 932 | OSAKA 933 | OTSUKA 934 | OTT 935 | OVH 936 | PA 937 | PAGE 938 | PAMPEREDCHEF 939 | PANASONIC 940 | PANERAI 941 | PARIS 942 | PARS 943 | PARTNERS 944 | PARTS 945 | PARTY 946 | PASSAGENS 947 | PAY 948 | PCCW 949 | PE 950 | PET 951 | PF 952 | PFIZER 953 | PG 954 | PH 955 | PHARMACY 956 | PHD 957 | PHILIPS 958 | PHONE 959 | PHOTO 960 | PHOTOGRAPHY 961 | PHOTOS 962 | PHYSIO 963 | PIAGET 964 | PICS 965 | PICTET 966 | PICTURES 967 | PID 968 | PIN 969 | PING 970 | PINK 971 | PIONEER 972 | PIZZA 973 | PK 974 | PL 975 | PLACE 976 | PLAY 977 | PLAYSTATION 978 | PLUMBING 979 | PLUS 980 | PM 981 | PN 982 | PNC 983 | POHL 984 | POKER 985 | POLITIE 986 | PORN 987 | POST 988 | PR 989 | PRAMERICA 990 | PRAXI 991 | PRESS 992 | PRIME 993 | PRO 994 | PROD 995 | PRODUCTIONS 996 | PROF 997 | PROGRESSIVE 998 | PROMO 999 | PROPERTIES 1000 | PROPERTY 1001 | PROTECTION 1002 | PRU 1003 | PRUDENTIAL 1004 | PS 1005 | PT 1006 | PUB 1007 | PW 1008 | PWC 1009 | PY 1010 | QA 1011 | QPON 1012 | QUEBEC 1013 | QUEST 1014 | QVC 1015 | RACING 1016 | RADIO 1017 | RAID 1018 | RE 1019 | READ 1020 | REALESTATE 1021 | REALTOR 1022 | REALTY 1023 | RECIPES 1024 | RED 1025 | REDSTONE 1026 | REDUMBRELLA 1027 | REHAB 1028 | REISE 1029 | REISEN 1030 | REIT 1031 | RELIANCE 1032 | REN 1033 | RENT 1034 | RENTALS 1035 | REPAIR 1036 | REPORT 1037 | REPUBLICAN 1038 | REST 1039 | RESTAURANT 1040 | REVIEW 1041 | REVIEWS 1042 | REXROTH 1043 | RICH 1044 | RICHARDLI 1045 | RICOH 1046 | RIGHTATHOME 1047 | RIL 1048 | RIO 1049 | RIP 1050 | RMIT 1051 | RO 1052 | ROCHER 1053 | ROCKS 1054 | RODEO 1055 | ROGERS 1056 | ROOM 1057 | RS 1058 | RSVP 1059 | RU 1060 | RUGBY 1061 | RUHR 1062 | RUN 1063 | RW 1064 | RWE 1065 | RYUKYU 1066 | SA 1067 | SAARLAND 1068 | SAFE 1069 | SAFETY 1070 | SAKURA 1071 | SALE 1072 | SALON 1073 | 
SAMSCLUB 1074 | SAMSUNG 1075 | SANDVIK 1076 | SANDVIKCOROMANT 1077 | SANOFI 1078 | SAP 1079 | SAPO 1080 | SARL 1081 | SAS 1082 | SAVE 1083 | SAXO 1084 | SB 1085 | SBI 1086 | SBS 1087 | SC 1088 | SCA 1089 | SCB 1090 | SCHAEFFLER 1091 | SCHMIDT 1092 | SCHOLARSHIPS 1093 | SCHOOL 1094 | SCHULE 1095 | SCHWARZ 1096 | SCIENCE 1097 | SCJOHNSON 1098 | SCOR 1099 | SCOT 1100 | SD 1101 | SE 1102 | SEARCH 1103 | SEAT 1104 | SECURE 1105 | SECURITY 1106 | SEEK 1107 | SELECT 1108 | SENER 1109 | SERVICES 1110 | SES 1111 | SEVEN 1112 | SEW 1113 | SEX 1114 | SEXY 1115 | SFR 1116 | SG 1117 | SH 1118 | SHANGRILA 1119 | SHARP 1120 | SHAW 1121 | SHELL 1122 | SHIA 1123 | SHIKSHA 1124 | SHOES 1125 | SHOP 1126 | SHOPPING 1127 | SHOUJI 1128 | SHOW 1129 | SHOWTIME 1130 | SHRIRAM 1131 | SI 1132 | SILK 1133 | SINA 1134 | SINGLES 1135 | SITE 1136 | SJ 1137 | SK 1138 | SKI 1139 | SKIN 1140 | SKY 1141 | SKYPE 1142 | SL 1143 | SLING 1144 | SM 1145 | SMART 1146 | SMILE 1147 | SN 1148 | SNCF 1149 | SO 1150 | SOCCER 1151 | SOCIAL 1152 | SOFTBANK 1153 | SOFTWARE 1154 | SOHU 1155 | SOLAR 1156 | SOLUTIONS 1157 | SONG 1158 | SONY 1159 | SOY 1160 | SPACE 1161 | SPIEGEL 1162 | SPOT 1163 | SPREADBETTING 1164 | SR 1165 | SRL 1166 | SRT 1167 | ST 1168 | STADA 1169 | STAPLES 1170 | STAR 1171 | STARHUB 1172 | STATEBANK 1173 | STATEFARM 1174 | STATOIL 1175 | STC 1176 | STCGROUP 1177 | STOCKHOLM 1178 | STORAGE 1179 | STORE 1180 | STREAM 1181 | STUDIO 1182 | STUDY 1183 | STYLE 1184 | SU 1185 | SUCKS 1186 | SUPPLIES 1187 | SUPPLY 1188 | SUPPORT 1189 | SURF 1190 | SURGERY 1191 | SUZUKI 1192 | SV 1193 | SWATCH 1194 | SWIFTCOVER 1195 | SWISS 1196 | SX 1197 | SY 1198 | SYDNEY 1199 | SYMANTEC 1200 | SYSTEMS 1201 | SZ 1202 | TAB 1203 | TAIPEI 1204 | TALK 1205 | TAOBAO 1206 | TARGET 1207 | TATAMOTORS 1208 | TATAR 1209 | TATTOO 1210 | TAX 1211 | TAXI 1212 | TC 1213 | TCI 1214 | TD 1215 | TDK 1216 | TEAM 1217 | TECH 1218 | TECHNOLOGY 1219 | TEL 1220 | TELECITY 1221 | TELEFONICA 1222 | TEMASEK 1223 | TENNIS 1224 | TEVA 1225 | 
TF 1226 | TG 1227 | TH 1228 | THD 1229 | THEATER 1230 | THEATRE 1231 | TIAA 1232 | TICKETS 1233 | TIENDA 1234 | TIFFANY 1235 | TIPS 1236 | TIRES 1237 | TIROL 1238 | TJ 1239 | TJMAXX 1240 | TJX 1241 | TK 1242 | TKMAXX 1243 | TL 1244 | TM 1245 | TMALL 1246 | TN 1247 | TO 1248 | TODAY 1249 | TOKYO 1250 | TOOLS 1251 | TOP 1252 | TORAY 1253 | TOSHIBA 1254 | TOTAL 1255 | TOURS 1256 | TOWN 1257 | TOYOTA 1258 | TOYS 1259 | TR 1260 | TRADE 1261 | TRADING 1262 | TRAINING 1263 | TRAVEL 1264 | TRAVELCHANNEL 1265 | TRAVELERS 1266 | TRAVELERSINSURANCE 1267 | TRUST 1268 | TRV 1269 | TT 1270 | TUBE 1271 | TUI 1272 | TUNES 1273 | TUSHU 1274 | TV 1275 | TVS 1276 | TW 1277 | TZ 1278 | UA 1279 | UBANK 1280 | UBS 1281 | UCONNECT 1282 | UG 1283 | UK 1284 | UNICOM 1285 | UNIVERSITY 1286 | UNO 1287 | UOL 1288 | UPS 1289 | US 1290 | UY 1291 | UZ 1292 | VA 1293 | VACATIONS 1294 | VANA 1295 | VANGUARD 1296 | VC 1297 | VE 1298 | VEGAS 1299 | VENTURES 1300 | VERISIGN 1301 | VERSICHERUNG 1302 | VET 1303 | VG 1304 | VI 1305 | VIAJES 1306 | VIDEO 1307 | VIG 1308 | VIKING 1309 | VILLAS 1310 | VIN 1311 | VIP 1312 | VIRGIN 1313 | VISA 1314 | VISION 1315 | VISTA 1316 | VISTAPRINT 1317 | VIVA 1318 | VIVO 1319 | VLAANDEREN 1320 | VN 1321 | VODKA 1322 | VOLKSWAGEN 1323 | VOLVO 1324 | VOTE 1325 | VOTING 1326 | VOTO 1327 | VOYAGE 1328 | VU 1329 | VUELOS 1330 | WALES 1331 | WALMART 1332 | WALTER 1333 | WANG 1334 | WANGGOU 1335 | WARMAN 1336 | WATCH 1337 | WATCHES 1338 | WEATHER 1339 | WEATHERCHANNEL 1340 | WEBCAM 1341 | WEBER 1342 | WEBSITE 1343 | WED 1344 | WEDDING 1345 | WEIBO 1346 | WEIR 1347 | WF 1348 | WHOSWHO 1349 | WIEN 1350 | WIKI 1351 | WILLIAMHILL 1352 | WIN 1353 | WINDOWS 1354 | WINE 1355 | WINNERS 1356 | WME 1357 | WOLTERSKLUWER 1358 | WOODSIDE 1359 | WORK 1360 | WORKS 1361 | WORLD 1362 | WOW 1363 | WS 1364 | WTC 1365 | WTF 1366 | XBOX 1367 | XEROX 1368 | XFINITY 1369 | XIHUAN 1370 | XIN 1371 | XN--11B4C3D 1372 | XN--1CK2E1B 1373 | XN--1QQW23A 1374 | XN--2SCRJ9C 1375 | XN--30RR7Y 1376 | 
XN--3BST00M 1377 | XN--3DS443G 1378 | XN--3E0B707E 1379 | XN--3HCRJ9C 1380 | XN--3OQ18VL8PN36A 1381 | XN--3PXU8K 1382 | XN--42C2D9A 1383 | XN--45BR5CYL 1384 | XN--45BRJ9C 1385 | XN--45Q11C 1386 | XN--4GBRIM 1387 | XN--54B7FTA0CC 1388 | XN--55QW42G 1389 | XN--55QX5D 1390 | XN--5SU34J936BGSG 1391 | XN--5TZM5G 1392 | XN--6FRZ82G 1393 | XN--6QQ986B3XL 1394 | XN--80ADXHKS 1395 | XN--80AO21A 1396 | XN--80AQECDR1A 1397 | XN--80ASEHDB 1398 | XN--80ASWG 1399 | XN--8Y0A063A 1400 | XN--90A3AC 1401 | XN--90AE 1402 | XN--90AIS 1403 | XN--9DBQ2A 1404 | XN--9ET52U 1405 | XN--9KRT00A 1406 | XN--B4W605FERD 1407 | XN--BCK1B9A5DRE4C 1408 | XN--C1AVG 1409 | XN--C2BR7G 1410 | XN--CCK2B3B 1411 | XN--CG4BKI 1412 | XN--CLCHC0EA0B2G2A9GCD 1413 | XN--CZR694B 1414 | XN--CZRS0T 1415 | XN--CZRU2D 1416 | XN--D1ACJ3B 1417 | XN--D1ALF 1418 | XN--E1A4C 1419 | XN--ECKVDTC9D 1420 | XN--EFVY88H 1421 | XN--ESTV75G 1422 | XN--FCT429K 1423 | XN--FHBEI 1424 | XN--FIQ228C5HS 1425 | XN--FIQ64B 1426 | XN--FIQS8S 1427 | XN--FIQZ9S 1428 | XN--FJQ720A 1429 | XN--FLW351E 1430 | XN--FPCRJ9C3D 1431 | XN--FZC2C9E2C 1432 | XN--FZYS8D69UVGM 1433 | XN--G2XX48C 1434 | XN--GCKR3F0F 1435 | XN--GECRJ9C 1436 | XN--GK3AT1E 1437 | XN--H2BREG3EVE 1438 | XN--H2BRJ9C 1439 | XN--H2BRJ9C8C 1440 | XN--HXT814E 1441 | XN--I1B6B1A6A2E 1442 | XN--IMR513N 1443 | XN--IO0A7I 1444 | XN--J1AEF 1445 | XN--J1AMH 1446 | XN--J6W193G 1447 | XN--JLQ61U9W7B 1448 | XN--JVR189M 1449 | XN--KCRX77D1X4A 1450 | XN--KPRW13D 1451 | XN--KPRY57D 1452 | XN--KPU716F 1453 | XN--KPUT3I 1454 | XN--L1ACC 1455 | XN--LGBBAT1AD8J 1456 | XN--MGB9AWBF 1457 | XN--MGBA3A3EJT 1458 | XN--MGBA3A4F16A 1459 | XN--MGBA7C0BBN0A 1460 | XN--MGBAAKC7DVF 1461 | XN--MGBAAM7A8H 1462 | XN--MGBAB2BD 1463 | XN--MGBAI9AZGQP6J 1464 | XN--MGBAYH7GPA 1465 | XN--MGBB9FBPOB 1466 | XN--MGBBH1A 1467 | XN--MGBBH1A71E 1468 | XN--MGBC0A9AZCG 1469 | XN--MGBCA7DZDO 1470 | XN--MGBERP4A5D4AR 1471 | XN--MGBGU82A 1472 | XN--MGBI4ECEXP 1473 | XN--MGBPL2FH 1474 | XN--MGBT3DHD 1475 | XN--MGBTX2B 1476 | 
# Singleton logger
#
# get_logger() lazily builds one shared logging.Logger named after the entry
# script and returns that same instance on every subsequent call.

import logging
import config as c

logger = None  # module-level singleton instance


def get_logger(loglevel, append=True):
    """Return the process-wide logger, creating it on first call.

    loglevel -- level name such as "debug" / "info" (case-insensitive);
                an unknown name raises ValueError
    append   -- when False, truncate the log file instead of appending

    The logger writes to <config.logs_folder>/<main script name>.log and to
    the console; both handlers are set to DEBUG so everything is captured.
    Later calls return the existing logger and ignore both arguments.
    """
    global logger
    if logger is not None:
        return logger  # already built: singleton short-circuit

    # BUGFIX: the previous default of logging.INFO made the isinstance check
    # below unreachable for unknown names, so a typo like "debgu" silently
    # became INFO instead of raising.  With None, invalid names fail loudly
    # as the ValueError branch clearly intended.
    numeric_level = getattr(logging, loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % loglevel)

    import sys
    import os
    # Name the logger (and its log file) after the script that was executed.
    module_name = str(os.path.basename(sys.modules['__main__'].__file__)).split('.')[0]

    logger = logging.getLogger(module_name)
    logger.setLevel(numeric_level)
    # create file handler which logs even debug messages
    fh = logging.FileHandler(c.logs_folder + module_name + '.log', mode=('a' if append else 'w'))
    fh.setLevel(logging.DEBUG)
    # create console handler at the same level (everything also hits the console)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s\t%(name)s\t%(levelname)s\t\t%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logger created!")
    return logger
"ratio_of_differ_sandns_in_cert", 41 | "ratio_of_same_subjects", 42 | "ratio_of_same_issuer", 43 | "ratio_is_same_CN_and_SNI", 44 | "average_certificate_exponent", 45 | "is_SNI_in_top_level_domain", 46 | "ratio_certificate_path_error", 47 | "ratio_missing_cert_in_cert_path", 48 | "in_alexa_top100", 49 | "in_alexa_top1k", 50 | "in_alexa_top10k", 51 | "in_alexa_top100k", 52 | "in_alexa_top1m", 53 | "not_in_alexa", 54 | "FQDN_length", 55 | "domain_name_length", 56 | "number_of_numerical_chars", 57 | "number_of_non_alphanumeric_chars", 58 | "number_unique_IP_addresses_in_response", 59 | "number_of_subdomains", 60 | "average_ttls", 61 | "std_ttls", 62 | "min_ttls", 63 | "max_ttls", 64 | "number_of_hyphens_in_fqdn", 65 | "length_of_longest_subdomain_name", 66 | "number_of_voyels_in_fqdn", 67 | "number_of_different_chars_in_fqdn", 68 | "number_of_consonants_in_fqdn", 69 | "shannon_entropy_2ld", 70 | "shannon_entropy_3ld"] 71 | """ 72 | 73 | less_important_features = [ 74 | "SNI_equal_DstIP", 75 | "ratio_of_differ_issuer_in_cert", 76 | "ratio_certificate_path_error", 77 | "ratio_missing_cert_in_cert_path", 78 | "standart_deviation_cert_length", 79 | "ratio_of_differ_subject_in_cert", 80 | "percent_of_established_states", 81 | "ratio_of_differ_issuer_in_ssl_log", 82 | "ratio_of_differ_subject_in_ssl_log", 83 | "is_SNI_in_top_level_domain", 84 | 85 | "ratio_of_same_issuer", 86 | "ratio_of_differ_sandns_in_cert", 87 | "in_alexa_top100k", 88 | "tls_version_ratio", 89 | "is_SNIs_in_SNA_dns", 90 | "in_alexa_top10k", 91 | "average_public_key", 92 | "number_of_hyphens_in_fqdn", 93 | "ratio_of_same_subjects", 94 | "average_certificate_exponent", 95 | 96 | "in_alexa_top1k", 97 | "is_CNs_in_SNA_dns", 98 | "amount_diff_certificates", 99 | "number_of_voyels_in_fqdn", 100 | "ssl_ratio", 101 | "in_alexa_top1m", 102 | "in_alexa_top100", 103 | "number_of_non_alphanumeric_chars", 104 | "x509_ssl_ratio", 105 | "number_of_flows", 106 | 107 | "periodicity_standart_deviation", 108 | 
"SNI_ssl_ratio", 109 | "length_of_longest_subdomain_name", 110 | "FQDN_length", 111 | "number_of_domains_in_certificate", 112 | "number_of_different_chars_in_fqdn", 113 | "percent_of_standard_deviation_duration", 114 | "domain_name_length", 115 | "ratio_is_same_CN_and_SNI", 116 | "number_of_certificate_path" 117 | ] 118 | 119 | def read_features(filename, features_name): 120 | import pandas as pd 121 | X = pd.read_csv(filename) 122 | return X[features_name] 123 | 124 | 125 | def read_labels(filename): 126 | with open(filename, 'r') as csvfile: 127 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 128 | y = csvreader.next() 129 | return y 130 | 131 | 132 | def get_features_name(): 133 | with open(c.model_folder + "features.csv", 'r') as csvfile: 134 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 135 | headers = csvreader.next() 136 | return headers[1:-1] 137 | 138 | 139 | def get_all_data(models_folder, set_name="all"): 140 | featuresname_all = get_features_name() 141 | features_set = { 142 | "all": featuresname_all[:63+1], 143 | "https": featuresname_all[:41], 144 | "dns": featuresname_all[41:63+1], 145 | "reduced": filter(lambda f: f not in less_important_features[:20], featuresname_all[:63+1]), 146 | "reduced_30": filter(lambda f: f not in less_important_features[:30], featuresname_all[:63+1]), 147 | "reduced_40": filter(lambda f: f not in less_important_features[:40], featuresname_all[:63+1]), 148 | "enhanced_30": filter(lambda f: f not in less_important_features[:30], featuresname_all) 149 | } 150 | 151 | X_train = read_features(models_folder + "X_train.csv", features_set[set_name]) 152 | X_test = read_features(models_folder + "X_test.csv", features_set[set_name]) 153 | y_train = read_labels(models_folder + "y_train.csv") 154 | y_test = read_labels(models_folder + "y_test.csv") 155 | #return np.array(X_train), np.array(X_test), np.array(y_train), 
import Get_normalize_data
import config as c
from logger import get_logger

# https://chrisalbon.com/machine_learning/feature_selection/anova_f-value_for_feature_selection/


def compare_quantitative_features(X, y, k=10):  # ANOVA F-value
    """Score every feature with the ANOVA F-test and print the scores.

    X -- feature matrix; y -- class labels.
    k -- how many top features SelectKBest is configured to keep (default 10,
         matching the previous hard-coded value; the old comment claimed
         "two best" which did not match the code).

    Returns the per-feature F-scores so callers can rank features themselves
    (the original returned None, so returning them is backward-compatible).
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif

    # Select the k features with the best ANOVA F-values
    fvalue_selector = SelectKBest(f_classif, k=k)

    # fit() is enough to populate scores_; the transformed matrix was unused.
    fvalue_selector.fit(X, y)

    # print() form is valid on both Python 2 and 3 (was a py2 print statement)
    print(fvalue_selector.scores_)
    return fvalue_selector.scores_


if __name__ == '__main__':
    logger = get_logger("debug")

    X_train, X_test, y_train, y_test = Get_normalize_data.get_all_data(c.model_folder)

    compare_quantitative_features(X_train, y_train)
    def __init__(self, name, classifier, param_grid=None):
        """Bundle a classifier with its name, search grid and evaluation state.

        name       -- display name used in logs and metric tables
        classifier -- scikit-learn style estimator (fit / predict / score)
        param_grid -- optional hyper-parameter grid; when set, train() wraps
                      the estimator in a grid or randomized search
        """
        self.classifier = classifier
        self.name = name
        self.param_grid = param_grid

        # Confusion-matrix cells; -1 is the "not computed yet" sentinel.
        self.tn = self.fp = self.fn = self.tp = -1
        # Metric name -> value in insertion order, filled by compute_metrics().
        self.metrics = OrderedDict()

        self.training_error = None
        self.is_trained = False

        self.score = None   # test-set score, set by predict()
        self.y_pred = None  # rounded test-set predictions, set by predict()

    def train(self, X_train, y_train, random=False):
        """Fit the classifier on the training data.

        When param_grid is set, the estimator is first wrapped in a
        GridSearchCV (random=False) or RandomizedSearchCV (random=True),
        both using 10-fold CV and precision scoring; self.classifier is
        replaced by the search object.  Sets training_error and is_trained.
        """
        if self.param_grid is not None and random is False:
            self.classifier = GridSearchCV(self.classifier, self.param_grid, cv=10, scoring='precision', n_jobs=-1)  # Do a 10-fold cross validation
        elif self.param_grid is not None and random is True:
            self.classifier = RandomizedSearchCV(self.classifier, param_distributions=self.param_grid,
                                                 n_iter=10, scoring='precision',
                                                 n_jobs=-1, cv=10, verbose=3, random_state=1001)

        logger.info('Training classifier {}'.format(self.name))
        main_tools.benchmark(self.classifier.fit, X_train, y_train)  # fit the classifier with data (timed)
        logger.info('Trained classifier {}'.format(self.name))
        # NOTE: despite the name, this is the training *score*, not an error rate.
        self.training_error = self.classifier.score(X_train, y_train)

        if self.param_grid is not None:
            logger.debug("Grid search best score = {}".format(self.classifier.best_score_))
            logger.debug("Grid search best estimator = {}".format(self.classifier.best_estimator_))
            logger.debug("Grid search cv results = {}".format(self.classifier.cv_results_))
        else:
            logger.debug("Model parameters = {}".format(self.classifier.get_params()))
        self.is_trained = True

    def predict(self, X_test, y_test):
        """Score the trained classifier on the test set and store predictions."""
        if not self.is_trained:
            raise Exception('Model not trained, please run train()')

        self.score = self.classifier.score(X_test, y_test)
        self.y_pred = [round(value) for value in self.classifier.predict(X_test)]  # Call predict on the estimator (with the best found parameters if Grid search).
        # Round in case the estimator returns probabilities (like with XGBoost)

    def compute_metrics(self, y_test):
        """Fill self.metrics from the confusion matrix of the last predict().

        Metrics whose denominator would make them meaningless (tp or tn of 0)
        are recorded as -1, the project's "undefined" sentinel, rather than
        raising a ZeroDivisionError.
        """
        if self.y_pred is None:
            raise Exception('No prediction found, please run predict()')

        from sklearn import metrics
        # labels=[0,1] pins the cell order even if a class is absent from y_test.
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, self.y_pred, labels=[0,1]).ravel()
        self.tn, self.fp, self.fn, self.tp = tn, fp, fn, tp

        logger.debug("tn={}, fp={}, fn={}, tp={}".format(tn, fp, fn, tp))

        tpr = -1 if tp <= 0 else float(tp) / (tp + fn)
        self.metrics["TPR"] = tpr  # True Positive Rate

        tnr = -1 if tn <= 0 else float(tn) / (fp + tn)
        self.metrics["TNR"] = tnr  # True Negative Rate

        fpr = -1 if tn <= 0 else float(fp) / (fp + tn)
        self.metrics["FPR"] = fpr  # False Positive Rate

        #fdr = -1 if tp <= 0 else float(fp) / (fp + tp)
        #self.metrics["FDR"] = fdr # False Discovery Rate

        accuracy = -1 if tp <= 0 or tn <= 0 else float(tp + tn) / (tp + tn + fp + fn)
        self.metrics["Acc"] = accuracy

        error_rate = -1 if tp <= 0 or tn <= 0 else float(fp + fn) / (tp + fn + fp + tn)
        self.metrics["Err"] = error_rate

        precision = -1 if tp <= 0 else float(tp) / (tp + fp)
        self.metrics["Pre"] = precision

        f_measure = -1 if precision <= 0 else float(2 * precision * tpr) / (precision + tpr)
        self.metrics["F-M"] = f_measure

        mcc = -1 if tp <= 0 or tn <= 0 else float(tp * tn - fp * fn) / \
            math.sqrt(float(tp + fn) * (tp + fp) * (tn + fp) * (tn + fn))
        self.metrics["MCC"] = mcc  # Matthew Correlation Coefficient

        roc_fpr, roc_tpr, thresholds = metrics.roc_curve(y_test, self.y_pred)
        self.metrics["AUC"] = metrics.auc(roc_fpr, roc_tpr)

    def get_printable_metrics(self):
        """Return a two-line, tab-separated report (header line + value line)
        for this model, timestamped with the current time."""
        if len(self.metrics) == 0:
            raise Exception('No metrics found, please run compute_metrics()')

        # Dead PrettyTable variant kept for reference (string literal is a no-op):
        """
        from prettytable import PrettyTable
        import operator

        headers = ['Model', 'Best score']
        headers += self.metrics.keys()

        table = PrettyTable(headers)
        content = [self.name, self.score]
        content += [round(float(m), 3) for m in self.metrics.values()]
        table.add_row(content)

        return table.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True)
        """

        headers = ['Exec time', 'Model', 'Best score']
        headers += self.metrics.keys()
        values = time.strftime("%Y-%m-%d_%H-%M-%S") + "\t" + "\t".join([self.name, str(self.score)] + map(str, self.metrics.values()))
        return "\t".join(headers) + "\n" + values



    @staticmethod
    def models_metric_summary(models):
        """Return one tab-separated table (header + one line per model) for a
        list of models that have all had compute_metrics() run.

        NOTE(review): the header is taken from models[0] -- assumes every
        model carries the same metric keys in the same order.
        """
        #from prettytable import PrettyTable
        #import operator

        headers = ['Model', 'Best score']
        headers += models[0].metrics.keys()
        """
        table = PrettyTable(headers)

        for model in models:
            if len(model.metrics) == 0:
                raise Exception('No metrics found for model "{}", please run compute_metrics()'.format(model.name))
            content = [model.name, model.score]
            content += [round(float(m), 3) for m in model.metrics.values()]
            table.add_row(content)

        return table.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True)
        """

        values = ""
        for model in models:
            if len(model.metrics) == 0:
                raise Exception('No metrics found for model "{}", please run compute_metrics()'.format(model.name))

            values += "\t".join([model.name, str(model.score)] + map(str,model.metrics.values())) + "\n"
        return "\t".join(headers) + "\n" + values

    def save(self, filename):
        """Pickle the (possibly search-wrapped) classifier to filename."""
        logger.info("Saving model to {}...".format(filename))
        pickle.dump(self.classifier, open(filename, "wb"))
        logger.info("Model saved to {}!".format(filename))

    def load(self, filename):
        """Replace self.classifier with a classifier unpickled from filename."""
        logger.info("Loading model from {}...".format(filename))
import os
import sys
sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/')

import csv

import pandas as pd


def normalize_data(data):
    """Scale every column of `data` to [0, 1] in place by dividing by the
    column maximum; return the mutated list.

    `data` is a list of equally-sized numeric rows.  A value of -1 is the
    project-wide "feature not computable" sentinel and is never rescaled.
    Columns whose maximum is 0 are left unchanged (avoids division by zero).
    """
    if not data:  # empty matrix: nothing to scale (previously crashed on data[0])
        return data
    for col in range(len(data[0])):
        col_max = 0  # renamed from `max`, which shadowed the builtin
        for row in range(len(data)):
            if col_max < data[row][col]:
                col_max = data[row][col]
        if col_max != 0:
            for row in range(len(data)):
                if data[row][col] != -1:  # keep the sentinel out of the scaling
                    data[row][col] = data[row][col] / float(col_max)
    return data


def write_features(filename, data, features_name):
    """Write the feature matrix as a headed CSV to <model_folder>/<filename>."""
    import config as c  # local import keeps this module importable without config.py
    df = pd.DataFrame(data, columns=features_name)
    df.to_csv(c.model_folder + filename, sep=',', encoding='utf-8', index=False)


def write_targets(file_name, data_list):
    """Write the label vector as a single CSV row to <model_folder>/<file_name>."""
    import config as c  # local import keeps this module importable without config.py
    index = 0

    # NOTE(review): 'wb' is the Python 2 csv convention; under Python 3 this
    # must become open(..., 'w', newline='') -- confirm the target interpreter.
    with open(c.model_folder + file_name, 'wb') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n', delimiter=',')
        writer.writerow(data_list)
        index += 1

    # print(...) with one pre-formatted string is identical on Python 2 and 3
    print("{} written lines: {}".format(file_name, index))


def transform_label(label):
    """Map a textual dataset label to a numeric target.

    Returns 1 when the label contains 'MALWARE' (checked first, so a label
    containing both markers counts as malware), 0 for 'NORMAL', and the
    sentinel -1 (with a warning) for anything else.
    """
    if 'MALWARE' in label:
        return 1
    elif "NORMAL" in label:
        return 0

    print("The label is incorrect")
    return -1


if __name__ == '__main__':
    # sklearn and config are only needed when run as a script.
    from sklearn.model_selection import train_test_split
    import config as c

    malwares = 0
    normals = 0

    X = list()
    y = list()

    LIMIT = -1  # total nb_lines, -1 = NO LIMIT

    with open(c.model_folder + "features.csv", 'r') as csvfile:
        csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        headers = next(csvreader)  # next() works on Python 2 and 3 (was .next())
        print(headers)
        line_nb = 0

        for row in csvreader:
            target = transform_label(row[-1])
            if LIMIT != -1:
                # Keep the two classes balanced at LIMIT/2 samples each;
                # rows beyond a class quota are skipped before being appended.
                if target == 1 and malwares < LIMIT / 2:
                    malwares += 1
                elif target == 0 and normals < LIMIT / 2:
                    normals += 1
                else:
                    continue
            else:
                malwares += target  # NOTE(review): an unknown label (-1) decrements this counter
                normals += 1 if target == 0 else 0

            X.append(row[1:-1])  # exclude key (index 0) and label (index -1 = last index)
            y.append(target)
            line_nb += 1

    features_name = headers[1:-1]

    # normalize X
    norm_X = normalize_data(X)
    print("Size of X: {}".format(len(X)))
    print("Malwares: {}".format(malwares))
    print("Normals: {}".format(len(X) - malwares))

    # split data by sklearn library
    X_train, X_test, y_train, y_test = train_test_split(norm_X, y, test_size=.2, random_state=35)

    # Write train data
    write_features('X_train.csv', X_train, features_name)
    write_targets('y_train.csv', y_train)

    # Write test data
    write_features('X_test.csv', X_test, features_name)
    write_targets('y_test.csv', y_test)
def select_models(models, models_name):
    """Return the subset of *models* whose ``name`` attribute is listed in *models_name*."""
    selected = []
    for model in models:
        if model.name in models_name:
            selected.append(model)
    return selected
"Random forest" 78 | classifier = RandomForestClassifier(n_jobs=-1) 79 | d_range = list(range(1, 31)) # list of parameter values to test 80 | #s_range = list(range(2, 10)) 81 | param_grid = dict(max_depth=d_range)#, min_samples_split=s_range) 82 | models.append(Model(name, classifier, param_grid)) 83 | 84 | #Naive Bayes 85 | name = "NB - Gaussian" 86 | classifier = GaussianNB() 87 | gnb = Model(name, classifier) 88 | models.append(gnb) 89 | 90 | #AdaBoost 91 | name = "AdaBoost" 92 | classifier = AdaBoostClassifier(n_estimators=100) 93 | adaboost = Model(name, classifier) 94 | models.append(adaboost) 95 | 96 | #Logistic Regression 97 | name = "Log. Regression" 98 | classifier = LogisticRegression() 99 | param_grid = dict(C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]) 100 | log_reg = Model(name, classifier, param_grid) 101 | models.append(log_reg) 102 | 103 | name = "Log. Reg l1" 104 | classifier = LogisticRegression(penalty='l1') 105 | param_grid = dict(C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]) 106 | log_reg_l1 = Model(name, classifier, param_grid) 107 | models.append(log_reg_l1) 108 | 109 | 110 | #Neural networks 111 | from sklearn.neural_network import MLPClassifier 112 | name = "Neural net" 113 | #classifier = MLPClassifier(alpha=1) 114 | #classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) 115 | classifier = MLPClassifier(solver='adam', alpha=1e-5, random_state=1) # from Strasak thesis 116 | nn = Model(name, classifier) 117 | models.append(nn) 118 | 119 | # SVM - Support Vector Machine 120 | name = "SVM - SVC" 121 | classifier = svm.SVC() 122 | C_range = np.logspace(-2, 10, 13) 123 | # print(C_range) 124 | gamma_range = np.logspace(-9, 3, 13) 125 | # print(gamma_range) 126 | param_grid = dict(gamma=gamma_range, C=C_range) 127 | models.append(Model(name, classifier, param_grid)) 128 | 129 | name = "SVM - Linear" 130 | classifier = svm.LinearSVC() 131 | #C_range = range(1,200,50) 132 | C_range 
= range(1,200,50) 133 | param_grid = dict(C=C_range) 134 | models.append(Model(name, classifier, param_grid)) 135 | 136 | name = "NB - Multinomial" 137 | classifier = MultinomialNB() 138 | models.append(Model(name, classifier)) 139 | 140 | name = "NB - Bernoulli" 141 | classifier = BernoulliNB() 142 | models.append(Model(name, classifier)) 143 | 144 | 145 | name = "XGBoost 1" 146 | classifier = XGBClassifier( 147 | learning_rate =0.1, 148 | n_estimators=1000, 149 | max_depth=10, 150 | min_child_weight=1, 151 | gamma=0, 152 | subsample=0.8, 153 | colsample_bytree=0.8, 154 | objective= 'binary:logistic', 155 | nthread=4, 156 | scale_pos_weight=1, 157 | seed=3) 158 | models.append(Model(name,classifier)) 159 | 160 | name = "XGBoost 2" 161 | classifier = XGBClassifier( 162 | learning_rate=0.1, 163 | n_estimators=1000, 164 | max_depth=3, 165 | min_child_weight=5, 166 | gamma=0.1, 167 | subsample=0.8, 168 | colsample_bytree=0.8, 169 | objective='binary:logistic', 170 | nthread=4, 171 | scale_pos_weight=1, 172 | seed=27) 173 | models.append(Model(name, classifier)) 174 | 175 | name = "XGBoost" 176 | classifier = XGBClassifier( 177 | learning_rate=0.1, 178 | n_estimators=1000, 179 | objective='binary:logistic', 180 | nthread=4, 181 | scale_pos_weight=1, 182 | seed=27) 183 | param_grid = { 184 | 'min_child_weight': [1, 5, 10], 185 | 'gamma': [0.5, 1, 1.5, 2, 5], 186 | 'subsample': [0.6, 0.8, 1.0], 187 | 'colsample_bytree': [0.6, 0.8, 1.0], 188 | 'max_depth': [3, 4, 5] 189 | } 190 | xgboost = Model(name, classifier, param_grid) 191 | models.append(xgboost) 192 | 193 | name = "Tuned XGBoost" 194 | classifier = XGBClassifier( 195 | learning_rate=0.1, 196 | n_estimators=1000, 197 | objective='binary:logistic', 198 | nthread=4, 199 | scale_pos_weight=1, 200 | seed=27) 201 | param_grid = { 202 | 'max_depth': range(3,10,2), 203 | 'min_child_weight': range(1,6,2), 204 | 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5, 2, 5], 205 | 'subsample': [i/10.0 for i in range(5,11)], 206 | 
'colsample_bytree': [i/10.0 for i in range(5,11)], 207 | 'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100] 208 | } 209 | tuned_xgboost = Model(name, classifier, param_grid) 210 | models.append(tuned_xgboost) 211 | 212 | 213 | name = "XGBoostBest" 214 | classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, 215 | colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, 216 | max_delta_step=0, max_depth=5, min_child_weight=1, missing=None, 217 | n_estimators=1000, n_jobs=1, nthread=4, objective='binary:logistic', 218 | random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 219 | seed=27, silent=True, subsample=0.8) 220 | xgboost_best = Model(name, classifier) 221 | 222 | 223 | #all_models = models.keys() 224 | models_to_train = ['XGBoost', 'k-NN', 'Decision tree', 'Random forest', 'NB - Gaussian','AdaBoost', 'Log. Regression', 'Neural net'] #, 'SVM - SVC'] 225 | 226 | # set_name can be: all, dns, https, reduced, reduced_30, reduced_40, enhanced_30 227 | set_name = "enhanced_30" 228 | X_train, X_test, y_train, y_test = Get_normalize_data.get_all_data(c.model_folder, set_name) 229 | 230 | #final_train(select_models(models, models_to_train), set_name) 231 | 232 | train(tuned_xgboost, set_name, random=False) 233 | 234 | #train(log_reg_l1, set_name) 235 | 236 | #train(log_reg, set_name) 237 | 238 | #train(xgboost_best, set_name) 239 | 240 | #train(xgboost, set_name, False) 241 | 242 | #train(gnb, set_name) 243 | 244 | 245 | -------------------------------------------------------------------------------- /main_tools.py: -------------------------------------------------------------------------------- 1 | from logger import get_logger 2 | logger = get_logger("debug") 3 | 4 | 5 | def benchmark(func, *params): 6 | import datetime 7 | import time 8 | start_time = time.time() 9 | return_value = func(*params) if params else func() 10 | total_time = datetime.timedelta(seconds=time.time() - start_time) 11 | logger.debug("Function {} - execution time : 
{}".format(func.__name__, total_time)) 12 | return return_value 13 | -------------------------------------------------------------------------------- /statistics/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/statistics/__init.py__.py -------------------------------------------------------------------------------- /statistics/datasets_statistics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/') 4 | import config as c 5 | 6 | 7 | def get_size_folder(start_path = '.'): 8 | total_size = 0 9 | for dirpath, dirnames, filenames in os.walk(start_path): 10 | for f in filenames: 11 | fp = os.path.join(dirpath, f) 12 | total_size += os.path.getsize(fp) 13 | return total_size 14 | 15 | # https://stackoverflow.com/a/1094933 16 | def sizeof_fmt(num, suffix='B'): 17 | for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: 18 | if abs(num) < 1024.0: 19 | return "%3.1f%s%s" % (num, unit, suffix) 20 | num /= 1024.0 21 | return "%.1f%s%s" % (num, 'Yi', suffix) 22 | 23 | def compute_malware_normal_packets(path_conn_label): 24 | normals = 0 25 | malwares = 0 26 | with open(path_conn_label) as f: 27 | for line in f: 28 | if line[0] == '#': 29 | continue 30 | split_conn_line = line.split('\t') 31 | 32 | if len(split_conn_line) < 22: 33 | continue 34 | 35 | label = split_conn_line[21] 36 | 37 | if 'From-Normal' in label: 38 | normals += 1 39 | elif 'From-Botnet' in label: 40 | malwares += 1 41 | 42 | return (normals, malwares) 43 | 44 | 45 | size_normal_dataset = 0 46 | size_ctu13_malware_dataset = 0 47 | size_other_malware_dataset = 0 48 | 49 | normal_dataset_normal_packets = 0 50 | ctu13_malware_dataset_normal_packets = 0 51 | other_malware_dataset_normal_packets = 0 52 | 53 | 
normal_dataset_malware_packets = 0 54 | ctu13_malware_dataset_malware_packets = 0 55 | other_malware_dataset_malware_packets = 0 56 | 57 | 58 | index = 0 59 | for sub_set in os.listdir(c.datasets_folder): 60 | if sub_set.startswith(".") or not os.path.exists(c.datasets_folder + sub_set + '/bro/ssl.log'): 61 | continue 62 | print "--------------------------------------------------------" 63 | print "-------- #" + str(index) + " " + sub_set 64 | print "--------------------------------------------------------" 65 | 66 | 67 | dataset_bro_folder = c.datasets_folder + sub_set + '/bro/' 68 | dataset_size = get_size_folder(dataset_bro_folder) 69 | print "Size of dataset : " + str(sizeof_fmt(dataset_size)) 70 | index += 1 71 | 72 | normals, malwares = compute_malware_normal_packets(dataset_bro_folder + 'conn_label.log') 73 | 74 | if sub_set.startswith("CTU-Normal-"): 75 | size_normal_dataset += dataset_size 76 | normal_dataset_normal_packets += normals 77 | normal_dataset_malware_packets += malwares 78 | elif sub_set.startswith("CTU-Malware-Capture-Botnet-") and 42 <= int(sub_set.split('-')[4]) <= 54: 79 | size_ctu13_malware_dataset += dataset_size 80 | ctu13_malware_dataset_normal_packets += normals 81 | ctu13_malware_dataset_malware_packets += malwares 82 | elif sub_set.startswith("CTU-Malware-Capture-Botnet-"): 83 | size_other_malware_dataset += dataset_size 84 | other_malware_dataset_normal_packets += normals 85 | other_malware_dataset_malware_packets +=malwares 86 | 87 | 88 | print "Normal packets: " + str(normals) 89 | print "Malware packets : " + str(malwares) 90 | 91 | 92 | 93 | print "\n\n============================" 94 | print "Size normal datasets : " + str(sizeof_fmt(size_normal_dataset)) 95 | print "\t>>> Normal packets : " + str(normal_dataset_normal_packets) 96 | print "\t>>> Malware packets : " + str(normal_dataset_malware_packets) 97 | print "Size CTU-13 malware datasets : " + str(sizeof_fmt(size_ctu13_malware_dataset)) 98 | print "\t>>> Normal packets : " 
+ str(ctu13_malware_dataset_normal_packets) 99 | print "\t>>> Malware packets : " + str(ctu13_malware_dataset_malware_packets) 100 | print "Size other malware datasets : " + str(sizeof_fmt(size_other_malware_dataset)) 101 | print "\t>>> Normal packets : " + str(other_malware_dataset_normal_packets) 102 | print "\t>>> Malware packets : " + str(other_malware_dataset_malware_packets) 103 | print "\n------------------" 104 | print " TOTAL Datasets" 105 | print "------------------" 106 | print "Total Size : " + str(sizeof_fmt(size_normal_dataset + size_ctu13_malware_dataset + size_other_malware_dataset)) 107 | print "Total normal packets : " + str(normal_dataset_normal_packets + ctu13_malware_dataset_normal_packets + other_malware_dataset_normal_packets) 108 | print "Total malwares packets : " + str(normal_dataset_malware_packets + ctu13_malware_dataset_malware_packets + other_malware_dataset_malware_packets) 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /statistics/dns_features_statistics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import config as c 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib 7 | from collections import Counter 8 | matplotlib.use('TkAgg') 9 | import matplotlib.pyplot as plt 10 | 11 | df = pd.read_csv(c.model_folder + 'features.csv') 12 | 13 | 14 | names = ["in_alexa_top100","in_alexa_top1k","in_alexa_top10k","in_alexa_top100k","in_alexa_top1m","not_in_alexa", 15 | "FQDN_length","domain_name_length","number_of_numerical_chars","number_of_non_alphanumeric_chars", 16 | "number_unique_IP_addresses_in_response","number_of_subdomains","average_ttls","min_ttls", 17 | "max_ttls","number_of_hyphens_in_fqdn","length_of_longest_subdomain_name","number_of_voyels_in_fqdn", 18 | "number_of_different_chars_in_fqdn","number_of_consonants_in_fqdn", 19 | "shannon_entropy_2ld","shannon_entropy_3ld","label"] 20 | 21 | 
#print(type(normal.values)) 22 | #print v.ndim 23 | 24 | #df.plot(kind='bar', stacked=True); 25 | 26 | """ 27 | shannon_entropy_2ld_botnet = list() 28 | shannon_entropy_2ld_normal = list() 29 | 30 | with open(c.model_folder + 'features.csv') as csvfile: 31 | reader = csv.DictReader(csvfile) 32 | for row in reader: 33 | print row['key'], row['number_of_flows'] 34 | if row['label'] == 'MALWARE': 35 | shannon_entropy_2ld_botnet.append(float(row['shannon_entropy_2ld'])) 36 | else: 37 | shannon_entropy_2ld_normal.append(float(row['shannon_entropy_2ld'])) 38 | 39 | """ 40 | 41 | import matplotlib 42 | matplotlib.use('TkAgg') 43 | import matplotlib.pyplot as plt 44 | 45 | 46 | def compute_stat_continue(feature_name, data): 47 | labels = ["Normal", "Botnet"] 48 | fig, ax = plt.subplots() 49 | ax.set_title('Feature ' + feature_name) 50 | ax.boxplot(data, labels=labels) 51 | 52 | fig.savefig(c.graphs_folder + feature_name + '.png') 53 | 54 | 55 | features_continue = ["FQDN_length","domain_name_length","number_of_numerical_chars","number_of_non_alphanumeric_chars", 56 | "number_unique_IP_addresses_in_response","number_of_subdomains","average_ttls","min_ttls", 57 | "max_ttls","number_of_hyphens_in_fqdn","length_of_longest_subdomain_name","number_of_voyels_in_fqdn", 58 | "number_of_different_chars_in_fqdn","number_of_consonants_in_fqdn", 59 | "shannon_entropy_2ld","shannon_entropy_3ld"] 60 | 61 | 62 | def plot_all_stat_continue(): 63 | for feature_name in features_continue: 64 | normal = df.loc[df['label'] == 'NORMAL'][feature_name] 65 | malware = df.loc[df['label'] == 'MALWARE'][feature_name] 66 | data = [normal, malware] 67 | compute_stat_continue(feature_name, data) 68 | 69 | def plot_alexa(): 70 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 71 | 72 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 73 | "not_in_alexa"] 74 | features_names_siplified = ["top 100", "top 1k", "top 10k", 
"top 100k", " top 1m", "not"] 75 | 76 | normal_means = list() 77 | normal_std = list() 78 | 79 | malware_means = list() 80 | malware_std = list() 81 | for i in range(len(features_alexa)): 82 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 83 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 84 | normal_means.append(np.mean(normal)) 85 | normal_std.append(np.std(normal)) 86 | malware_means.append(np.mean(malware)) 87 | malware_std.append(np.std(malware)) 88 | 89 | 90 | 91 | N = len(features_alexa) 92 | men_means = (20, 35, 30, 35, 27) 93 | men_std = (2, 3, 4, 1, 2) 94 | 95 | ind = np.arange(N) # the x locations for the groups 96 | width = 0.35 # the width of the bars 97 | 98 | fig, ax = plt.subplots() 99 | rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std) 100 | 101 | women_means = (25, 32, 34, 20, 25) 102 | women_std = (3, 5, 2, 3, 3) 103 | rects2 = ax.bar(ind + width, malware_means, width, color='r', yerr=malware_std) 104 | 105 | # add some text for labels, title and axes ticks 106 | ax.set_ylabel('Scores') 107 | ax.set_title('Scores by group and gender') 108 | ax.set_xticks(ind + width / 2) 109 | ax.set_xticklabels(features_names_siplified) 110 | 111 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 112 | fig.savefig(c.graphs_folder + "features_alexa" + '.png') 113 | 114 | def plot_alexa2(): 115 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 116 | 117 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 118 | "not_in_alexa"] 119 | features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k", " top 1m", "not"] 120 | 121 | normal_percentage = list() 122 | malware_percentage = list() 123 | for i in range(len(features_alexa)): 124 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 125 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 126 | normal_percentage.append(sum(normal) / 
float(len(df))) 127 | malware_percentage.append(sum(malware) / float(len(df))) 128 | 129 | N = len(features_alexa) 130 | 131 | ind = np.arange(N) # the x locations for the groups 132 | width = 0.35 # the width of the bars 133 | 134 | fig, ax = plt.subplots() 135 | rects1 = ax.bar(ind, normal_percentage, width, color='g') 136 | 137 | rects2 = ax.bar(ind + width, malware_percentage, width, color='r') 138 | 139 | # add some text for labels, title and axes ticks 140 | ax.set_ylabel('Percentage of flows') 141 | ax.set_title('Flows in Top alexa') 142 | ax.set_xticks(ind + width / 2) 143 | ax.set_xticklabels(features_names_simplified) 144 | 145 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 146 | fig.savefig(c.graphs_folder + "features_alexa2" + '.png') 147 | 148 | 149 | def plot_barchar(features_name): 150 | # Example https://matplotlib.org/2.0.2/examples/api/barchart_demo.html 151 | 152 | normal = df.loc[df['label'] == 'NORMAL'][features_name] 153 | malware = df.loc[df['label'] == 'MALWARE'][features_name] 154 | c = Counter(normal) 155 | print c.most_common(15) 156 | return 157 | 158 | 159 | 160 | features_alexa = ["in_alexa_top100", "in_alexa_top1k", "in_alexa_top10k", "in_alexa_top100k", "in_alexa_top1m", 161 | "not_in_alexa"] 162 | features_names_simplified = ["top 100", "top 1k", "top 10k", "top 100k", " top 1m", "not"] 163 | 164 | normal_means = list() 165 | normal_std = list() 166 | 167 | malware_means = list() 168 | malware_std = list() 169 | for i in range(len(features_alexa)): 170 | normal = df.loc[df['label'] == 'NORMAL'][features_alexa[i]] 171 | malware = df.loc[df['label'] == 'MALWARE'][features_alexa[i]] 172 | normal_means.append(np.mean(normal)) 173 | normal_std.append(np.std(normal)) 174 | malware_means.append(np.mean(malware)) 175 | malware_std.append(np.std(malware)) 176 | 177 | 178 | 179 | N = len(features_alexa) 180 | men_means = (20, 35, 30, 35, 27) 181 | men_std = (2, 3, 4, 1, 2) 182 | 183 | ind = np.arange(N) # the x locations for the 
groups 184 | width = 0.35 # the width of the bars 185 | 186 | fig, ax = plt.subplots() 187 | rects1 = ax.bar(ind, normal_means, width, color='g', yerr=normal_std) 188 | 189 | women_means = (25, 32, 34, 20, 25) 190 | women_std = (3, 5, 2, 3, 3) 191 | rects2 = ax.bar(ind + width, malware_means, width, color='r', yerr=malware_std) 192 | 193 | # add some text for labels, title and axes ticks 194 | ax.set_ylabel('Scores') 195 | ax.set_title('Scores by group and gender') 196 | ax.set_xticks(ind + width / 2) 197 | ax.set_xticklabels(features_names_simplified) 198 | 199 | ax.legend((rects1[0], rects2[0]), ('Normal', 'Botnet')) 200 | fig.savefig(c.graphs_folder + "features_alexa" + '.png') 201 | 202 | plot_all_stat_continue() 203 | plot_alexa2() 204 | plot_barchar("number_unique_IP_addresses_in_response") 205 | 206 | nb_flows_normal = len(df.loc[df['label'] == 'NORMAL']) 207 | nb_flows_malware = len(df.loc[df['label'] == 'MALWARE']) 208 | print "Number of flows Normal : " + str(nb_flows_normal) 209 | print "Number of flows Malware : " + str(nb_flows_malware) 210 | print "Total : " + str(nb_flows_normal + nb_flows_malware) 211 | -------------------------------------------------------------------------------- /tools/__init.py__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lminy/BotnetDetectionThesis/5a54541229a6d7255f0eebe65aaf8b1c35b9be04/tools/__init.py__.py -------------------------------------------------------------------------------- /tools/backup_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0, os.environ['HOME'] + '/BotnetDetectionThesis/') 4 | import shutil 5 | import config as c 6 | import time 7 | from logger import get_logger 8 | 9 | logger = get_logger('debug') 10 | 11 | 12 | def copytree(src, dst, symlinks=False, ignore=None): 13 | for item in os.listdir(src): 14 | s = os.path.join(src, item) 15 
def is_ipv4(address):
    """Return True when *address* is a valid dotted-quad IPv4 string.

    Accepts exactly four '.'-separated fields, each parsing as an integer
    in the range 0..255.  (The parameter was renamed: the original
    shadowed the builtin ``str``.)
    """
    parts = address.split('.')
    if len(parts) != 4:
        return False
    try:
        octets = [int(part) for part in parts]
    except ValueError:
        return False
    # all() short-circuits and, unlike len(filter(...)), also works on Python 3.
    return all(0 <= octet <= 255 for octet in octets)
def entropy(text):
    """Calculate the Shannon entropy of a string, in bits per character.

    Returns 0.0 for the empty string (the original raised
    ZeroDivisionError on ``len(text) == 0``).
    """
    if not text:
        return 0.0
    length = float(len(text))
    # Probability of each *distinct* character; set() avoids counting twice.
    probabilities = [text.count(ch) / length for ch in set(text)]
    # math.log(p, 2) replaces the hand-rolled log(p)/log(2.0) base change.
    return -sum(p * math.log(p, 2) for p in probabilities)
"/bro_ciphers/" 14 | os.mkdir(datasets_folder + dataset_name + "/bro_ciphers/") 15 | 16 | subprocess.Popen(["bro", "-C", "-r", "../"+filename, "-b", "base/protocols/ssl", "site/tls_finger"], cwd=working_dir).wait() -------------------------------------------------------------------------------- /tools/generate_features_table.py: -------------------------------------------------------------------------------- 1 | import config as c 2 | import csv 3 | from collections import OrderedDict 4 | 5 | 6 | def get_features_name(): 7 | with open(c.model_folder + "features.csv", 'r') as csvfile: 8 | csvreader = csv.reader(csvfile, lineterminator='\n', delimiter=',', quoting=csv.QUOTE_NONNUMERIC) 9 | features_name = csvreader.next()[1:-1] 10 | return features_name 11 | 12 | if __name__ == '__main__': 13 | 14 | features_name = get_features_name() 15 | 16 | #latex_table = "\\begin{table}[!h]\n" \ 17 | # "\centering\n" \ 18 | # "\\begin{adjustbox}{max width=\\textwidth}\n" \ 19 | # "\\begin{tabular}{llll}\n" 20 | headers = "\\textbf{{{}}} & \\textbf{{{}}} & \\textbf{{{}}} & \\textbf{{{}}} \\\\ \n\hline \n".format("", "ID", "Feature name", "proposed in") 21 | # latex_table += headers 22 | 23 | # Long table 24 | latex_table = "\\begin{longtable}{llll}\n" 25 | latex_table += headers 26 | latex_table += "\endhead\n" \ 27 | "\endfoot\n" \ 28 | 29 | dns_features_name = features_name[41:] 30 | ordered_dns = OrderedDict().fromkeys(dns_features_name) 31 | dns_features_references = { 32 | 'number_of_different_chars_in_fqdn': "marques2017thesis", 33 | 'number_of_hyphens_in_fqdn': "wang2015breakingbad", 34 | 'shannon_entropy_3ld': "marques2017thesis", 35 | 'number_of_voyels_in_fqdn': "aashna2017dga", 36 | 'in_alexa_top100': "anderson2016identifying", 37 | 'number_of_subdomains': "hao2017exploring", 38 | 'not_in_alexa': "anderson2016identifying", 39 | 'min_ttls': "marques2017thesis", 40 | 'shannon_entropy_2ld': "marques2017thesis", 41 | 'length_of_longest_subdomain_name': "hao2017exploring", 
def generate_table(results, headers, caption, label):
    """Render tab-separated *results* as a LaTeX table wrapped in an adjustbox.

    *results* is a newline-separated list of rows; within a row, the first
    tab-separated field is the model name and the remaining fields are
    numbers, rounded to 3 decimal places in the output.  *headers*,
    *caption* and *label* are inserted verbatim into the LaTeX skeleton.
    """
    latex_table = ("\\begin{table}[!h]\n"
                   "\\centering\n"
                   "\\begin{adjustbox}{max width=\\textwidth}\n"
                   "\\begin{tabular}{lllllllllll}\n")

    latex_table += " & ".join("\\textbf{{{}}}".format(h) for h in headers)
    latex_table += "\\\\ \\hline \n"

    for raw_line in results.split("\n"):
        # str.strip() per cell: map(string.strip, ...) was Python-2-only
        # (string.strip is gone and map() is not subscriptable on Python 3).
        cells = [cell.strip() for cell in raw_line.split("\t")]
        latex_table += cells[0]
        for value in cells[1:]:
            latex_table += " & " + str(round(float(value), 3))
        latex_table += " \\\\ \n"

    latex_table += ("\\end{tabular}\n"
                    "\\end{adjustbox}\n")
    latex_table += "\\caption{{{}}}\n".format(caption)
    latex_table += "\\label{{{}}}\n".format(label)
    latex_table += "\\end{table}\n"

    return latex_table
"&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c||}{FPR}&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c||}{FPR}&\multicolumn{1}{c|}{Acc}&\multicolumn{1}{c|}{FPR}\\\\ \\midrule \n" 56 | 57 | for model in models_order: 58 | latex_table += "{} & {:.3f} & {:.3f} & {:.3f} & {:.3f} & {:.3f} & {:.3f} \\\\ \n".format(model, https[model][4], https[model][3], https_dns[model][4], https_dns[model][3], enhanced_30[model][4], enhanced_30[model][3]) 59 | 60 | latex_table += "\\midrule\end{tabular}\n" \ 61 | "\end{center}" 62 | latex_table += "\caption{{{}}}\n".format(caption) 63 | latex_table += "\label{{{}}}\n".format(label) 64 | latex_table += "\end{table}\n" 65 | 66 | return latex_table 67 | 68 | if __name__ == '__main__': 69 | 70 | headers = ['Model', 'Best score', 'TPR', 'TNR', 'FPR', 'Acc', 'Err', 'Pre', 'F-M', 'MCC', 'AUC'] 71 | 72 | # https 73 | results_https = "XGBoost 0.9853618866901599 0.984 0.987 0.013 0.985 0.015 0.987 0.985 0.971 0.985\n \ 74 | Random forest 0.97289238276 0.969 0.977 0.023 0.973 0.027 0.977 0.973 0.946 0.973\n \ 75 | Decision tree 0.955543507726 0.95 0.961 0.039 0.956 0.044 0.961 0.955 0.911 0.956\n \ 76 | AdaBoost 0.951206288967 0.952 0.95 0.05 0.951 0.049 0.95 0.951 0.902 0.951\n \ 77 | k-NN 0.880726484142 0.871 0.89 0.11 0.881 0.119 0.888 0.88 0.762 0.881\n \ 78 | Neural net 0.8359989156953104 0.862 0.81 0.19 0.836 0.164 0.819 0.84 0.673 0.836\n \ 79 | Log. 
Regression 0.817565735972 0.796 0.839 0.161 0.818 0.182 0.832 0.813 0.636 0.818\n \ 80 | NB - Gaussian 0.5917592843589049 0.234 0.949 0.051 0.592 0.408 0.821 0.364 0.262 0.591" 81 | 82 | # https + dns 83 | results_https_dns = "XGBoost 0.9886148007590133 0.985 0.992 0.008 0.989 0.011 0.992 0.989 0.977 0.989\n \ 84 | AdaBoost\t0.970452697208 0.971 0.97 0.03 0.97 0.03 0.97 0.97 0.941 0.97\n \ 85 | Random forest 0.969910544863 0.958 0.982 0.018 0.97 0.03 0.982 0.97 0.94 0.97\n \ 86 | Decision tree 0.956627812415 0.946 0.967 0.033 0.957 0.043 0.967 0.956 0.913 0.957\n \ 87 | k-NN 0.905394415831 0.893 0.918 0.082 0.905 0.095 0.916 0.904 0.811 0.905\n \ 88 | Neural net 0.9024125779343996 0.939 0.866 0.134 0.902 0.098 0.875 0.906 0.807 0.902\n \ 89 | Log. Regression\t0.877744646246 0.863 0.893 0.107 0.878 0.122 0.889 0.876 0.756 0.878\n \ 90 | NB - Gaussian 0.750338845216 0.555 0.945 0.055 0.75 0.25 0.91 0.69 0.544 0.75\n" 91 | 92 | # enhanced_30 feature set 93 | results_enhanced_30 = "XGBoost 0.999 0.995 0.999 0.001 0.997 0.003 0.999 0.997 0.994 0.997\n \ 94 | Random forest 0.999453850355 0.993 0.999 0.001 0.996 0.004 0.999 0.996 0.992 0.996\n \ 95 | Decision tree 0.999437570304 0.964 0.999 0.001 0.982 0.018 0.999 0.981 0.964 0.982\n \ 96 | k-NN 0.998904709748 0.99 0.999 0.001 0.994 0.006 0.999 0.994 0.989 0.994\n \ 97 | AdaBoost 0.997018162104 0.995 0.999 0.001 0.997 0.003 0.999 0.997 0.994 0.997\n \ 98 | Neural net 0.996476009759 0.993 0.999 0.001 0.996 0.004 0.999 0.996 0.993 0.996\n \ 99 | Log. Regression 0.995662781242 0.992 0.999 0.001 0.996 0.004 0.999 0.996 0.991 0.996\n \ 100 | NB - Gaussian 0.994578476552 0.993 0.996 0.004 0.995 0.005 0.996 0.995 0.989 0.995\n \ 101 | " 102 | results_enhanced_30 = "XGBoost 0.9994547437295529 0.994574064026 0.999458288191 0.000541711809317 0.997018162104 0.00298183789645 0.99945474373 0.997008430786 0.994048130058 0.9970161761083636\n \ 103 | Log. 
def binarySearch(alist, item):
    """Return True if ``item`` occurs in the sorted list ``alist``.

    Classic iterative binary search, O(log n).  ``alist`` must be sorted
    under the same ordering that ``<`` / ``==`` apply to its elements.
    An empty list is handled (returns False immediately).

    NOTE(review): the caller below sorts with ``key=str.lower`` while this
    search compares case-sensitively, so mixed-case entries may be
    missed — confirm the intended ordering.

    :param alist: sorted sequence to search
    :param item: value to look for
    :return: True if found, False otherwise
    """
    first = 0
    last = len(alist) - 1
    while first <= last:
        midpoint = (first + last) // 2
        if alist[midpoint] == item:
            # Early return replaces the original's `found` flag and the
            # dead `pos` variable (assigned but never used or returned).
            return True
        if item < alist[midpoint]:
            last = midpoint - 1
        else:
            first = midpoint + 1
    return False
def test():
    """CPU-bound busy work used to exercise benchmark().

    Accumulates the integers 0..9999 and returns their total.
    (Deliberately avoids the builtin ``sum`` — this module shadows it
    with a two-argument function below.)
    """
    total = 0
    value = 0
    while value < 10000:
        total += value
        value += 1
    return total