├── inv_app ├── __init__.py ├── build.sh ├── inv_app_crd_parse.Dockerfile ├── inv_app_crd_train.Dockerfile ├── inv_app_crd_parse.py └── inv_app_crd_train.py ├── inv_aut ├── __init__.py ├── build.sh ├── inv_aut_train.Dockerfile ├── inv_aut_parse.Dockerfile ├── inv_aut_parse.py └── inv_aut_train.py ├── clean_image.sh ├── data_split.Dockerfile ├── inv_sql ├── inv_sql_train.Dockerfile ├── build.sh ├── inv_sql_parse.Dockerfile ├── inv_sql_parse.py └── inv_sql_train.py ├── label_save.Dockerfile ├── data_save.Dockerfile ├── realtime_save.Dockerfile ├── data_backup.Dockerfile ├── data_backup.py ├── LICENSE ├── data_split.py ├── README.md ├── fsi_splunk.py ├── splunk_queries.py └── data_save.py /inv_app/__init__.py: -------------------------------------------------------------------------------- 1 | import data_parse, data_save -------------------------------------------------------------------------------- /inv_aut/__init__.py: -------------------------------------------------------------------------------- 1 | import data_parse, data_save -------------------------------------------------------------------------------- /clean_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker rmi $(docker images -f dangling=true -q) -------------------------------------------------------------------------------- /inv_app/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_app_crd_parse.Dockerfile -t aisec:inv_app_crd_parse . 3 | docker build -f inv_app_crd_train.Dockerfile -t aisec:inv_app_crd_train . -------------------------------------------------------------------------------- /data_split.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_split.py /home/dockeruser/data_split.py 6 | 7 | CMD python3 data_split.py 8 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | 3 | RUN mkdir /home/dockeruser/npy 4 | RUN mkdir /home/dockeruser/models 5 | 6 | COPY inv_sql_train.py /home/dockeruser/train.py 7 | 8 | CMD python3 train.py 9 | -------------------------------------------------------------------------------- /inv_sql/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_sql_parse.Dockerfile -t aisec:crd_inv_sql_parse . 3 | docker build -f inv_sql_train.Dockerfile -t aisec:crd_inv_sql_train . 4 | docker build -f inv_sql_predict.Dockerfile -t aisec:crd_inv_sql_predict . 5 | -------------------------------------------------------------------------------- /inv_aut/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_aut_parse.Dockerfile -t aisec:crd_inv_aut_parse . 3 | docker build -f inv_aut_train.Dockerfile -t aisec:crd_inv_aut_train . 4 | docker build -f inv_aut_predict.Dockerfile -t aisec:crd_inv_aut_predict . 
5 | 6 | -------------------------------------------------------------------------------- /inv_aut/inv_aut_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/npy 5 | RUN mkdir /home/dockeruser/models 6 | 7 | COPY inv_aut_train.py /home/dockeruser/train.py 8 | 9 | CMD python3 train.py 10 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER HyeSeong Jeong 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_sql_parse.py /home/dockeruser/inv_sql_parse.py 8 | 9 | CMD python3 inv_sql_parse.py -------------------------------------------------------------------------------- /inv_aut/inv_aut_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_aut_parse.py /home/dockeruser/inv_aut_parse.py 8 | 9 | CMD python3 inv_aut_parse.py 10 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Aechan Kim 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_app_crd_parse.py /home/dockeruser/inv_app_crd_parse.py 8 | 9 | CMD python3 inv_app_crd_parse.py 10 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | MAINTAINER Aechan Kim 3 | 4 | RUN mkdir /home/dockeruser/npy 5 | RUN mkdir /home/dockeruser/models 6 | 7 | COPY inv_app_crd_train.py /home/dockeruser/inv_app_crd_train.py 8 | 9 | CMD python3 inv_app_crd_train.py 10 | -------------------------------------------------------------------------------- /label_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py label 10 | -------------------------------------------------------------------------------- /data_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py payload 10 | -------------------------------------------------------------------------------- /realtime_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | 
COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py realtime 10 | -------------------------------------------------------------------------------- /data_backup.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/prediction 6 | RUN mkdir /home/dockeruser/data_backup 7 | RUN mkdir /home/dockeruser/prediction_backup 8 | COPY data_backup.py /home/dockeruser/data_backup.py 9 | 10 | CMD python3 data_backup.py 11 | -------------------------------------------------------------------------------- /data_backup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import shutil 4 | import datetime 5 | 6 | 7 | def backup_data(data_dir, backup_dir, include_string, keep_days=7): 8 | 9 | past_day = datetime.datetime.today() + datetime.timedelta(days=-keep_days) 10 | past_day_string = past_day.strftime("%Y%m%d") 11 | 12 | for filename in os.listdir(data_dir): 13 | if past_day_string in filename and include_string in filename: 14 | from_path = os.path.join(data_dir, filename) 15 | to_path = os.path.join(backup_dir, filename) 16 | shutil.move(from_path, to_path) 17 | 18 | 19 | if __name__ == "__main__": 20 | backup_data("./data", "./data_backup", "total", keep_days=3) 21 | backup_data("./data", "./data_backup", "payload", keep_days=3) 22 | backup_data("./data", "./data_backup", "aut", keep_days=28) 23 | backup_data("./data", "./data_backup", "app", keep_days=28) 24 | backup_data("./data", "./data_backup", "sql", keep_days=28) 25 | backup_data("./prediction", "./prediction_backup", "", keep_days=7) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Financial Security Institute (FSI). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /data_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import datetime 4 | 5 | 6 | def classify_payload(datafile_list, category_list): 7 | 8 | category_dict = { 9 | "payload" : "", 10 | "app" : "응용프로그램 취약점 공격", 11 | "aut" : "패스워드 추측 및 인증우회 공격", 12 | "sql" : "SQL Injection 공격" 13 | } 14 | 15 | for filename in datafile_list: 16 | dirname = os.path.dirname(filename) 17 | ymd = str(os.path.basename(filename).split('_')[0]) 18 | df = pd.read_csv(filename, header=0) 19 | df = df.fillna(value={"MID_CATE_NM": ""}) 20 | for category in category_list: 21 | new_filename = os.path.join(dirname, ymd + "_" + category + ".csvx") 22 | df[df["MID_CATE_NM"]==category_dict[category]].to_csv( 23 | new_filename, 24 | index=False, 25 | encoding="utf-8" 26 | ) 27 | os.umask(0) 28 | os.chmod(new_filename, 0o666) 29 | os.umask(0o027) 30 | 31 | 32 | def classify_label(labelfile_list): 33 | for filename in labelfile_list: 34 | dirname = os.path.dirname(filename) 35 | ymd = str(os.path.basename(filename).split('_')[0]) 36 | df = pd.read_csv(filename, header=0) 37 | for model_name in df.model_name.unique(): 38 | new_filename = os.path.join(dirname, ymd + "_" + model_name + ".csvx") 39 | df[df["model_name"]==model_name].to_csv( 40 | new_filename, 41 | index=False, 42 | encoding="utf-8" 43 | ) 44 | os.umask(0) 45 | os.chmod(new_filename, 0o666) 46 | os.umask(0o027) 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | mid_category_list = ["payload", "app", "aut", "sql"] 52 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 53 | day_string = day_before_yesterday.strftime("%Y%m%d") 54 | data_dir = "./data" 55 | file_list = list(os.path.join(data_dir, f) 56 | for f in os.listdir(data_dir) 57 | if day_string + "_total" in f) 58 | 59 | label_list = list(os.path.join(data_dir, f) 60 | for f in os.listdir(data_dir) 61 | if day_string + "_label" in f) 62 | 63 | classify_payload(file_list, mid_category_list) 64 | classify_label(label_list) 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # License 2 | Our AI-IDS software is licensed under the MIT License; the full license text and notice are included in the LICENSE file. 3 | 4 | # AI-IDS 5 | AI-IDS: Application of Deep Learning to Real-time Web Intrusion Detection 6 | 7 | We implemented our Artificial Intelligence-based Intrusion Detection System (AI-IDS) and applied it to real-time web traffic to distinguish sophisticated attacks, such as unknown patterns, encoded payloads, or obfuscated attacks, from benign traffic. It also helps with writing and improving Snort rules based on newly identified patterns. AI-IDS is a flexible and scalable system implemented as a set of Docker images, with user-defined functions separated into independent images. We designed a CNN-LSTM model structure based on normalized UTF-8 encoding for big-data-scale web traffic. 8 | 9 | # payload_analysis 10 | This is the AI-IDS software that can run in a Splunk environment. 11 | 12 | More details: 13 | The paper "AI-IDS: Application of Deep Learning to Real-time Web Intrusion Detection" has been published in IEEE Access, Vol. 8, 2020.
14 | doi:10.1109/ACCESS.2020.2986882 15 | Authors: Aechan Kim, Mohyun Park, DongHoon Lee 16 | 17 | 18 | # Bio 19 | Aechan Kim (ackim@fsec.or.kr) is an assistant manager in Financial Security Institute (FSI), Yongin, South Korea. He received the B.S. degree in Industrial Engineering from Seoul National University of Science and Technology, Seoul, South Korea, in 2009, and the M.S. degree in financial information security from Korea University, Seoul, in 2014, where he is currently pursuing the Ph.D. degree in Graduate School of Information Security. 20 | 21 | Mohyun Park (mhpark@fsec.or.kr) is a manager in Financial Security Institute (FSI), Yongin, South Korea. He received the B.S. degree in Computer Science from Seoul National University, Seoul, South Korea, in 2013. 22 | 23 | Dong Hoon Lee (donghlee@korea.ac.kr) received the B.S. degree from the Department of Economics, Korea University, Seoul, in 1985, and the M.S. and Ph.D. degrees in computer science from The University of Oklahoma, Norman, in 1988 and 1992, respectively. Since 1993, he has been with the Faculty of Computer Science and Information Security, Korea University. He is currently a Professor and the Director of the Graduate School of Information Security, Korea University. 24 | 25 | # Acknowledgments 26 | This research was supported by Financial Security Institute (FSI), South Korea. 27 | -------------------------------------------------------------------------------- /fsi_splunk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | import time 4 | import re 5 | import xml.etree.ElementTree as et 6 | 7 | 8 | ######################################################### 9 | ### Splunk Query Setup # 10 | ### search_query (string) : Query # 11 | ### period (int) : duration for task completion (sec) # 12 | ### output_format (string) : outputfile format (csv, xml, json) # 13 | ### auth (string tuple) : (ID, PW) # 14 | ### output_count (int) : Number of ouput count # 15 | ### output file format : csv, xml, json # 16 | ### return (string) : result of query # 17 | ######################################################### 18 | def query(splunk_host, search_query, check_frequency, output_format, auth, sample_ratio=1, output_count=0): 19 | 20 | if not search_query.startswith('|'): 21 | 22 | if 'latest' not in search_query: 23 | search_query = 'latest=now ' + search_query 24 | 25 | if 'earliest' not in search_query: 26 | search_query = 'earliest=-15m@m ' + search_query 27 | 28 | if not search_query.startswith('search'): 29 | search_query = 'search ' + search_query 30 | 31 | 32 | if output_format not in ['csv', 'xml','json']: 33 | return '' 34 | 35 | splunk_job_url = splunk_host + "/services/search/jobs" 36 | search_response = requests.post(splunk_job_url, 37 | data = {'search':search_query, 38 | 'dispatch.sample_ratio':sample_ratio}, 39 | auth = auth, 40 | verify=False) 41 | # Job has been submitted. 
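    # The POST above creates an asynchronous search job; Splunk's REST API replies with
    # an XML document whose <sid> element identifies the job. The code below extracts
    # that SID, polls the job endpoint until its dispatchState reaches DONE (or FAILED),
    # and then downloads the results in the requested output_mode (csv, xml or json).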
42 | try: 43 | search_root = et.fromstring(search_response.text) 44 | splunk_sid = search_root.find('sid').text 45 | except AttributeError: 46 | print(search_response.text) 47 | exit(0) 48 | return None 49 | except et.ParseError: 50 | print(search_response.text) 51 | exit(0) 52 | 53 | while True: 54 | time.sleep(check_frequency) 55 | job_response = requests.get(splunk_job_url + '/' + splunk_sid, 56 | auth = auth, 57 | verify=False) 58 | 59 | job_status = re.search('(.+)', job_response.text).group(1) 60 | 61 | #job_root = et.fromstring(job_response.text) 62 | #ns = {'atom': 'http://www.w3.org/2005/Atom', 's': 'http://dev.splunk.com/ns/rest'} 63 | #job_status = job_root.find("./atom:content/s:dict/s:key[@name='dispatchState']", ns).text 64 | 65 | if job_status == 'DONE': # Job is finished 66 | break 67 | if job_status == 'FAILED': # Job is finished 68 | print('Search Failed!') 69 | print(job_response.text) 70 | exit(0) 71 | 72 | 73 | splunk_result = requests.get(splunk_job_url + '/' + splunk_sid 74 | + '/results?output_mode=' + output_format 75 | + '&count=' + str(output_count), 76 | auth = auth, 77 | verify=False) 78 | 79 | if ' 0: 116 | latest_notable = 0 117 | args = { 118 | "earliest_minute": str(earliest_minute), 119 | "latest_minute": str(latest_minute), 120 | "latest_notable": str(latest_notable), 121 | "top_category": top_category, 122 | "mid_category": mid_category 123 | } 124 | query = """ 125 | 126 | """ 127 | query = query.format(**args) 128 | 129 | return query 130 | 131 | 132 | def search_query_payload_test(earliest_minute=-1450, latest_minute=-1440): 133 | 134 | args = { 135 | "earliest_minute": str(earliest_minute), 136 | "latest_minute": str(latest_minute) 137 | } 138 | 139 | query = """ 140 | 141 | """ 142 | query = query.format(**args) 143 | 144 | return query 145 | 146 | -------------------------------------------------------------------------------- /data_save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import datetime 4 | import sys 5 | import shutil 6 | 7 | 8 | def append_data(data_dir, data_type, splunk_query, credentials, headers, data_count=0, sample_ratio=1): 9 | 10 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 11 | 12 | yesterday_string = yesterday.strftime("%Y%m%d") 13 | new_filename = data_dir + yesterday_string + "_" + data_type + ".csvx" 14 | 15 | if not os.path.exists(new_filename): 16 | os.umask(0) 17 | with open(os.open(new_filename, os.O_CREAT | os.O_WRONLY, 0o666), 'a') as f: 18 | f.write(','.join(headers) + '\n') 19 | 20 | # 전일 데이터 입력 21 | splunk_data = None 22 | while splunk_data is None: 23 | splunk_data = fsi_splunk.query( 24 | "http://192.168.143.39:8089", 25 | splunk_query, 26 | check_frequency=10, 27 | output_format='csv', 28 | auth=credentials, 29 | sample_ratio=sample_ratio, 30 | output_count=data_count 31 | ) 32 | 33 | if len(splunk_data) > 0: 34 | with open(new_filename, 35 | mode='a', 36 | encoding='utf-8', 37 | newline='') as f: 38 | splunk_data = splunk_data[splunk_data.index('\n'):] 39 | f.write(splunk_data + "\n") 40 | 41 | 42 | def replace_data(data_path, splunk_query, credentials, headers, data_count=0, sample_ratio=1): 43 | 44 | splunk_data = None 45 | while splunk_data is None: 46 | splunk_data = fsi_splunk.query( 47 | "http://192.168.143.39:8089", 48 | splunk_query, 49 | check_frequency=10, 50 | output_format='csv', 51 | auth=credentials, 52 | output_count=data_count 53 | ) 54 | 55 | with open(data_path, 56 | mode='w', 57 | 
encoding='utf-8', 58 | newline='') as f: 59 | f.write(splunk_data + '\n') 60 | 61 | 62 | if __name__ == "__main__": 63 | import fsi_splunk 64 | from splunk_queries import search_query_total, search_query_label, search_query_payload 65 | 66 | credentials = ('airesearch', 'airflow!@') 67 | 68 | if sys.argv[-1] == "payload": 69 | 70 | payload_headers = [ 71 | "_time", 72 | "src_ip", 73 | "src_port", 74 | "dest_ip", 75 | "dest_port", 76 | "src_content", 77 | "TOP_CATE_NM", 78 | "MID_CATE_NM", 79 | "suppression", 80 | "desc", 81 | "drill", 82 | "msg", 83 | "label" 84 | ] 85 | 86 | append_data( 87 | "./data/", 88 | "total", 89 | search_query_total(headers=payload_headers), 90 | credentials=credentials, 91 | headers=payload_headers, 92 | data_count=500000, 93 | ) 94 | 95 | elif sys.argv[-1] == "label": 96 | 97 | label_headers = [ 98 | "_time", 99 | "src_ip", 100 | "src_port", 101 | "dest_ip", 102 | "dest_port", 103 | "src_content", 104 | "model_name", 105 | "label", 106 | "comment" 107 | ] 108 | 109 | append_data( 110 | "./data/", 111 | "label", 112 | search_query_label(earliest_minute=-1500, latest_minute=-1440, headers=label_headers), 113 | credentials=credentials, 114 | headers=label_headers, 115 | data_count=100000 116 | ) 117 | 118 | elif sys.argv[-1] == "realtime": 119 | 120 | realtime_headers = [ 121 | "_time", 122 | "tas", 123 | "src_ip", 124 | "src_port", 125 | "dest_ip", 126 | "dest_port", 127 | "src_content", 128 | ] 129 | 130 | data_dir = "./data/" 131 | holder_filename = "payload_holder.tmp" 132 | payload_filename = "payload_data.tmp" 133 | 134 | holder_path = os.path.join(data_dir, holder_filename) 135 | payload_path = os.path.join(data_dir, payload_filename) 136 | 137 | replace_data( 138 | holder_path, 139 | search_query_payload(earliest_minute=-200, latest_minute=-180, headers=realtime_headers), 140 | credentials=credentials, 141 | headers=realtime_headers, 142 | data_count=1000000 143 | ) 144 | shutil.copy(holder_path, payload_path) 145 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | def convert_content(content_string, x_dim, pad_before=True): 15 | 16 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 17 | if len(int_list) < x_dim: 18 | if pad_before: 19 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 20 | else: 21 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 22 | 23 | return int_list 24 | 25 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 26 | 27 | try: 28 | dataframe = pandas.read_csv(filename, 29 | header=0, 30 | usecols=["src_content", "label"], 31 | skiprows=list(range(1, start_index)), 32 | nrows=batch_size, 33 | engine='python') 34 | labels = dataframe["label"].values.astype(np.uint8) 35 | except ValueError: 36 | dataframe = pandas.read_csv(filename, 37 | header=0, 38 | usecols=["src_content"], 39 | skiprows=list(range(1, start_index)), 40 | nrows=batch_size, 41 | engine='python') 42 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 43 | 44 | labels = labels.reshape((labels.shape[0], 1)) 45 | src_content = list(convert_content(x, 
x_dim=x_dim, pad_before=pad_before) 46 | for x in dataframe["src_content"].values) 47 | 48 | src_content_aug = src_content 49 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 50 | 51 | for i in range(1, augmentation): 52 | if pad_before: 53 | src_content_aug = src_content_aug + list( 54 | [np.uint8(0)]*i + content[:-i] for content in src_content 55 | ) 56 | else: 57 | src_content_aug = src_content_aug + list( 58 | content[:-i] + [np.uint8(0)] * i for content in src_content 59 | ) 60 | 61 | src_content_aug = np.array(src_content_aug) 62 | file_no = int(start_index / batch_size) 63 | if pad_before: 64 | pad_string = '_prepad' 65 | else: 66 | pad_string = '_postpad' 67 | 68 | basename = os.path.basename(filename) 69 | file_extension_index = basename.rfind('.') 70 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 71 | save_filename = os.path.join(npy_dir, save_basename) 72 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 73 | gc.collect() 74 | 75 | return 76 | 77 | 78 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 79 | 80 | processors = int(multiprocessing.cpu_count() / 1.5) 81 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 82 | 83 | for filepath in datafile_list: 84 | if pad_before: 85 | pad_string = '_prepad' 86 | else: 87 | pad_string = '_postpad' 88 | 89 | filename = os.path.basename(filepath) 90 | file_extension_index = filename.rfind('.') 91 | npy_filename = filename[:file_extension_index] + pad_string + '_0.npy' 92 | 93 | if npy_filename in os.listdir(npy_dir): 94 | continue 95 | 96 | try: 97 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 98 | except EmptyDataError: 99 | continue 100 | 101 | row_count = df_temp.shape[0] 102 | del(df_temp) 103 | gc.collect() 104 | 105 | pool = multiprocessing.Pool(processes=processors) 106 | 107 | split_size = int(np.ceil(row_count / line_per_processor)) 108 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 109 | 110 | pool.map(partial(convert_data, 111 | filename=filepath, npy_dir=npy_dir, 112 | batch_size=line_per_processor, 113 | x_dim=x_dim, 114 | pad_before=pad_before, 115 | augmentation=augmentation 116 | ), 117 | index_list) 118 | 119 | pool.close() 120 | pool.join() 121 | gc.collect() 122 | 123 | 124 | if __name__ == "__main__": 125 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 126 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 127 | yesterday_string = yesterday.strftime("%Y%m%d") 128 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 129 | data_dir = "./data/" 130 | npy_dir = "./npy/" 131 | 132 | payload_file_list = list(os.path.join(data_dir, f) 133 | for f in os.listdir(data_dir) 134 | if "payload" in f and day_before_yesterday_string in f) 135 | 136 | sql_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "sql" in f and yesterday_string not in f) 139 | 140 | label_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "INV-SQL" in f and yesterday_string not in f) 143 | 144 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 145 | convert_file_list(sql_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 146 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 147 | 
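A minimal sketch of the byte-level encoding that inv_sql_parse.py (and its inv_aut/inv_app siblings) apply to each payload: the string is UTF-8 encoded, truncated to x_dim bytes, zero-padded before or after, and the label is appended as the last column of the saved .npy array. The encode_payload function and the example request string below are illustrative only and are not part of the original scripts.

import numpy as np

def encode_payload(content_string, x_dim=1000, pad_before=True):
    # UTF-8 encode, truncate to x_dim bytes, then zero-pad to a fixed length.
    byte_list = list(str(content_string).encode('utf-8'))[:x_dim]
    padding = [0] * (x_dim - len(byte_list))
    padded = padding + byte_list if pad_before else byte_list + padding
    return np.array(padded, dtype=np.uint8)

# Example (hypothetical request string): a (50,) uint8 vector with leading zeros
# followed by the raw UTF-8 byte values of the payload.
vector = encode_payload("GET /index.php?id=1%20OR%201=1", x_dim=50)
print(vector.shape, vector[-10:])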
-------------------------------------------------------------------------------- /inv_aut/inv_aut_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | 15 | def convert_content(content_string, x_dim, pad_before=True): 16 | 17 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 18 | if len(int_list) < x_dim: 19 | if pad_before: 20 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 21 | else: 22 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 23 | 24 | return int_list 25 | 26 | 27 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 28 | 29 | try: 30 | dataframe = pandas.read_csv(filename, 31 | header=0, 32 | usecols=["src_content", "label"], 33 | skiprows=list(range(1, start_index)), 34 | nrows=batch_size, 35 | engine='python') 36 | labels = dataframe["label"].values.astype(np.uint8) 37 | except ValueError: 38 | dataframe = pandas.read_csv(filename, 39 | header=0, 40 | usecols=["src_content"], 41 | skiprows=list(range(1, start_index)), 42 | nrows=batch_size, 43 | engine='python') 44 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 45 | 46 | labels = labels.reshape((labels.shape[0], 1)) 47 | src_content = list(convert_content(x, x_dim=x_dim, pad_before=pad_before) 48 | for x in dataframe["src_content"].values) 49 | 50 | src_content_aug = src_content 51 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 52 | 53 | for i in range(1, augmentation): 54 | if pad_before: 55 | src_content_aug = src_content_aug + list( 56 | [np.uint8(0)]*i + content[:-i] for content in src_content 57 | ) 58 | else: 59 | src_content_aug = src_content_aug + list( 60 | content[:-i] + [np.uint8(0)] * i for content in src_content 61 | ) 62 | 63 | src_content_aug = np.array(src_content_aug) 64 | file_no = int(start_index / batch_size) 65 | if pad_before: 66 | pad_string = '_prepad' 67 | else: 68 | pad_string = '_postpad' 69 | 70 | basename = os.path.basename(filename) 71 | file_extension_index = basename.rfind('.') 72 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 73 | save_filename = os.path.join(npy_dir, save_basename) 74 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 75 | gc.collect() 76 | 77 | return 78 | 79 | 80 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 81 | 82 | processors = int(multiprocessing.cpu_count() / 1.5) 83 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 84 | 85 | for filepath in datafile_list: 86 | if pad_before: 87 | pad_string = '_prepad' 88 | else: 89 | pad_string = '_postpad' 90 | 91 | filename = os.path.basename(filepath) 92 | file_extension_index = filename.rfind('.') 93 | npy_filename = filename[:file_extension_index] + pad_string + "_0.npy" 94 | 95 | if npy_filename in os.listdir(npy_dir): # Check already parsed npy existence 96 | continue 97 | 98 | try: 99 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 100 | except EmptyDataError: 101 | continue 102 | 103 | row_count = df_temp.shape[0] 104 | del(df_temp) 105 | gc.collect() 106 | 107 | pool = multiprocessing.Pool(processes=processors) 108 | 109 | 
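        # Each CSV is split into chunks of line_per_processor rows; the worker pool
        # converts the chunks in parallel and writes each chunk to its own numbered
        # .npy file (suffix _<chunk_no>.npy).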
split_size = int(np.ceil(row_count / line_per_processor)) 110 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 111 | 112 | pool.map(partial(convert_data, 113 | filename=filepath, 114 | npy_dir=npy_dir, 115 | batch_size=line_per_processor, 116 | x_dim=x_dim, 117 | pad_before=pad_before, 118 | augmentation=augmentation 119 | ), 120 | index_list) 121 | 122 | pool.close() 123 | pool.join() 124 | gc.collect() 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 130 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 131 | yesterday_string = yesterday.strftime("%Y%m%d") 132 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 133 | data_dir = "./data/" 134 | npy_dir = "./npy/" 135 | 136 | payload_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "payload" in f and day_before_yesterday_string in f) 139 | 140 | aut_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "aut" in f and yesterday_string not in f) 143 | 144 | label_file_list = list(os.path.join(data_dir, f) 145 | for f in os.listdir(data_dir) 146 | if "INV-AUT" in f and yesterday_string not in f) 147 | 148 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 149 | convert_file_list(aut_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 150 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 151 | 152 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | 15 | def convert_content(content_string, x_dim, pad_before=True): 16 | 17 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 18 | if len(int_list) < x_dim: 19 | if pad_before: 20 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 21 | else: 22 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 23 | 24 | return int_list 25 | 26 | 27 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 28 | 29 | try: 30 | dataframe = pandas.read_csv(filename, 31 | header=0, 32 | usecols=["src_content", "label"], 33 | skiprows=list(range(1, start_index)), 34 | nrows=batch_size, 35 | engine='python') 36 | labels = dataframe["label"].values.astype(np.uint8) 37 | except ValueError: 38 | dataframe = pandas.read_csv(filename, 39 | header=0, 40 | usecols=["src_content"], 41 | skiprows=list(range(1, start_index)), 42 | nrows=batch_size, 43 | engine='python') 44 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 45 | 46 | labels = labels.reshape((labels.shape[0], 1)) 47 | src_content = list(convert_content(x, x_dim=x_dim, pad_before=pad_before) 48 | for x in dataframe["src_content"].values) 49 | 50 | src_content_aug = src_content 51 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 52 | 53 | for i in range(1, augmentation): 54 | if pad_before: 55 | src_content_aug = src_content_aug + list( 56 | [np.uint8(0)]*i + content[:-i] for content in src_content 57 | ) 58 | 
else: 59 | src_content_aug = src_content_aug + list( 60 | content[:-i] + [np.uint8(0)] * i for content in src_content 61 | ) 62 | 63 | src_content_aug = np.array(src_content_aug) 64 | file_no = int(start_index / batch_size) 65 | if pad_before: 66 | pad_string = '_prepad' 67 | else: 68 | pad_string = '_postpad' 69 | 70 | basename = os.path.basename(filename) 71 | file_extension_index = basename.rfind('.') 72 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 73 | save_filename = os.path.join(npy_dir, save_basename) 74 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 75 | gc.collect() 76 | 77 | return 78 | 79 | 80 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 81 | 82 | processors = int(multiprocessing.cpu_count() / 1.5) 83 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 84 | 85 | for filepath in datafile_list: 86 | if pad_before: 87 | pad_string = '_prepad' 88 | else: 89 | pad_string = '_postpad' 90 | 91 | filename = os.path.basename(filepath) 92 | file_extension_index = filename.rfind('.') 93 | npy_filename = filename[:file_extension_index] + pad_string + "_0.npy" 94 | 95 | if npy_filename in os.listdir(npy_dir): # Check already parsed npy existence 96 | continue 97 | 98 | try: 99 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 100 | except EmptyDataError: 101 | continue 102 | 103 | row_count = df_temp.shape[0] 104 | del(df_temp) 105 | gc.collect() 106 | 107 | pool = multiprocessing.Pool(processes=processors) 108 | 109 | split_size = int(np.ceil(row_count / line_per_processor)) 110 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 111 | 112 | pool.map(partial(convert_data, 113 | filename=filepath, 114 | npy_dir=npy_dir, 115 | batch_size=line_per_processor, 116 | x_dim=x_dim, 117 | pad_before=pad_before, 118 | augmentation=augmentation 119 | ), 120 | index_list) 121 | 122 | pool.close() 123 | pool.join() 124 | gc.collect() 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 130 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 131 | yesterday_string = yesterday.strftime("%Y%m%d") 132 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 133 | data_dir = "./data/" 134 | npy_dir = "./npy/" 135 | 136 | payload_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "payload" in f and day_before_yesterday_string in f) 139 | 140 | app_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "app" in f and yesterday_string not in f) 143 | 144 | label_file_list = list(os.path.join(data_dir, f) 145 | for f in os.listdir(data_dir) 146 | if "INV-APP" in f and yesterday_string not in f) 147 | 148 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 149 | convert_file_list(app_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 150 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 151 | 152 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_train.py: -------------------------------------------------------------------------------- 1 | import keras.utils 2 | import numpy as np 3 | import datetime 4 | import os 5 | import gc 6 | import keras.backend.tensorflow_backend as keras_tf_backend 7 | import 
tensorflow as tf 8 | import psutil 9 | import threading 10 | import shutil 11 | 12 | from copy import deepcopy 13 | from keras.models import Sequential 14 | from keras.layers import Conv1D, MaxPooling1D 15 | from keras.layers import LeakyReLU, BatchNormalization 16 | from keras.layers import Dropout 17 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 18 | 19 | class DataSequence(keras.utils.Sequence): 20 | 21 | def _find_npy_size(self, filename): 22 | with open(filename, 'rb') as f: 23 | data = str(f.read(100)[50:]) 24 | shape_index = data.find('shape') 25 | comma_index = data[shape_index:].find(',') 26 | 27 | return int(data[shape_index+9 : shape_index+comma_index]) 28 | 29 | def _load_cache(self, label, idx): 30 | 31 | cache_end_index = self._cache_end_index_dict[label] 32 | cache_start_index = self._cache_start_index_dict[label] 33 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 34 | return 35 | 36 | threshold = self._cache_threshold_dict[label] 37 | batch_size = self._batch_size_dict[label] 38 | 39 | delete_count = int((idx - 64) * batch_size) - cache_start_index 40 | if delete_count > 0: 41 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 42 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 43 | 44 | new_row_count = self._cache_dict[label].shape[0] 45 | dict_size = self._cache_dict[label].nbytes 46 | if new_row_count > 0: 47 | self._avg_size = float(dict_size / new_row_count) 48 | 49 | remaining_size = threshold - dict_size 50 | temp_array_filename_list = [] 51 | while self._file_size_list_dict[label]: 52 | file_name = self._file_list_dict[label].pop(0) 53 | file_size = self._file_size_list_dict[label].pop(0) 54 | temp_array_filename_list.append(file_name) 55 | remaining_size = remaining_size - file_size * self._avg_size 56 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 57 | 58 | if not self._file_size_list_dict[label]: 59 | break 60 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 61 | break 62 | 63 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + list(np.load(x) for x in temp_array_filename_list))) 64 | gc.collect() 65 | 66 | return 67 | 68 | def _initialize_objects(self): 69 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 70 | self._cache_dict = dict() 71 | gc.collect() 72 | 73 | for label in self._file_list_dict: 74 | self._file_size_list_dict[label] = [] 75 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 76 | self._cache_start_index_dict[label] = 0 77 | self._cache_end_index_dict[label] = 0 78 | for filename in self._file_list_dict[label]: 79 | npy_size = self._find_npy_size(filename) 80 | self._file_size_list_dict[label].append(npy_size) 81 | 82 | for label in self._cache_dict: 83 | self._load_cache(label, 0) 84 | 85 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 86 | 87 | self._backup_file_list_dict = deepcopy(file_list_dict) 88 | self._batch_size = batch_size 89 | self._x_dim = x_dim 90 | self._avg_size = (x_dim + 1) 91 | self._data_size = 0 92 | self._file_size_list_dict = dict() 93 | self._batch_size_dict = dict() 94 | self._cache_threshold_dict = dict() 95 | self._cache_start_index_dict = dict() 96 | self._cache_end_index_dict = dict() 97 | self._lock_dict = dict() 98 | 99 | mem = psutil.virtual_memory() 100 | buffer = 0.25 101 | 102 | total_threshold = mem.available * mem_share * (1 - buffer) 103 | 104 | for label in 
file_list_dict: 105 | self._batch_size_dict[label] = 0 106 | self._lock_dict[label] = threading.Lock() 107 | 108 | for filename in file_list_dict[label]: 109 | npy_size = self._find_npy_size(filename) 110 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 111 | self._data_size = self._data_size + npy_size 112 | 113 | for label in self._batch_size_dict: 114 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 115 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 116 | 117 | # Dividing with total data size / label size to get the batch size of each label 118 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 119 | 120 | self._initialize_objects() 121 | 122 | def __len__(self): 123 | return int(np.ceil(self._data_size / self._batch_size)) 124 | 125 | def __getitem__(self, idx): 126 | 127 | data = np.empty((0, self._x_dim+1)) 128 | for label in self._cache_end_index_dict: 129 | 130 | start_index = int(idx * self._batch_size_dict[label]) 131 | end_index = int((idx + 1) * self._batch_size_dict[label]) 132 | 133 | with self._lock_dict[label]: 134 | if end_index > self._cache_end_index_dict[label]: 135 | self._load_cache(label, idx) 136 | cache_start_index = start_index - self._cache_start_index_dict[label] 137 | 138 | if cache_start_index < 0: 139 | cache_start_index = 0 140 | 141 | cache_end_index = cache_start_index + (end_index - start_index) 142 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 143 | 144 | data = data[:self._batch_size] 145 | np.random.shuffle(data) 146 | 147 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 148 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 149 | 150 | train_x = (train_x - 128.0) / -128.0 151 | 152 | return (train_x, train_y) 153 | 154 | def on_epoch_end(self): 155 | self._initialize_objects() 156 | 157 | 158 | def get_session(gpu_share=0.2, threads=2): 159 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 160 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads, gpu_options=gpu_options) 161 | config.gpu_options.allow_growth = True 162 | 163 | return tf.Session(config=config) 164 | 165 | 166 | def initialize_hyperparameters(parameter_dict): 167 | 168 | if parameter_dict is None: 169 | # Default Values 170 | parameter_tuple = (3, 12, 4, 6, 14, 4, 8, 0.5, 0.1) 171 | else: 172 | parameter_tuple = ( 173 | parameter_dict["conv_depth"], 174 | parameter_dict["conv_filter"], 175 | parameter_dict["conv_kernel_width"], 176 | parameter_dict["conv_pool"], 177 | parameter_dict["lstm_units"], 178 | parameter_dict["dense_depth"], 179 | parameter_dict["dense_units"], 180 | parameter_dict["dense_dropout"], 181 | parameter_dict["dense_relu_alpha"] 182 | ) 183 | return parameter_tuple 184 | 185 | 186 | def create_model(input_dim, hyperparameter_dict=None): 187 | 188 | (conv_depth, 189 | conv_filter, 190 | conv_kernel_width, 191 | conv_pool, 192 | lstm_units, 193 | dense_depth, 194 | dense_units, 195 | dense_dropout, 196 | dense_relu_alpha 197 | ) = initialize_hyperparameters(hyperparameter_dict) 198 | 199 | model = Sequential() 200 | 201 | # CNN Layer 202 | for i in range(conv_depth): 203 | conv_filter_size = conv_filter * pow(conv_pool, i) 204 | if i == 0: 205 | model.add(Conv1D(conv_filter_size, 206 | conv_kernel_width, 207 | padding='same', 208 | activation='relu', 
209 | input_shape=(input_dim, 1))) 210 | else: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu')) 215 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 216 | model.add(BatchNormalization()) 217 | 218 | # RNN Layer 219 | if conv_depth > 0: 220 | (_, lstm_timesteps, lstm_features) = model.output_shape 221 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 222 | else: 223 | lstm_input_shape = (input_dim, 1) # Starts with RNN 224 | 225 | model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 226 | model.add(Bidirectional(CuDNNLSTM(lstm_units))) 227 | 228 | for _ in range(dense_depth): 229 | model.add(Dense(dense_units)) 230 | model.add(Dropout(dense_dropout)) 231 | model.add(LeakyReLU(dense_relu_alpha)) 232 | model.add(BatchNormalization()) 233 | 234 | # Output Layer 235 | model.add(Dense(1, activation='sigmoid')) 236 | 237 | # print(model.summary()) 238 | model.compile(optimizer='adam' , loss='binary_crossentropy', metrics=['accuracy']) 239 | 240 | return model 241 | 242 | 243 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 244 | 245 | processors = int(psutil.cpu_count() / 1.5) 246 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 247 | model.fit_generator(generator=generator, 248 | epochs=5, 249 | verbose=verbose, 250 | shuffle=False, 251 | workers=processors, 252 | ) 253 | 254 | if save_model: 255 | model.save(model_name) 256 | 257 | 258 | def test_model(model, test_file_list, x_dim): 259 | 260 | data = np.concatenate(tuple( 261 | list(np.load(filename) for filename in test_file_list) 262 | )) 263 | 264 | test_size = data.shape[0] 265 | 266 | x_test = np.array(data[:, :-1]) 267 | x_test = (x_test - 128.0) / -128.0 268 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 269 | y_test = data[:, [-1]].reshape(test_size, 1) 270 | y_prediction = model.predict(x=x_test, 271 | batch_size=4096, 272 | verbose=0) 273 | 274 | y_merged = (y_prediction.round()*2 + y_test).flatten() 275 | value, counts = np.unique(y_merged, return_counts=True) 276 | value_str = list(map(lambda x: str(int(x)), value)) 277 | metrics = dict(zip(value_str, counts)) 278 | 279 | for y_merged in ['0', '1', '2', '3']: 280 | if y_merged not in metrics: 281 | metrics[y_merged] = 0 282 | 283 | metrics['TP'] = metrics['3'] 284 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 285 | metrics['FN'] = metrics['1'] 286 | metrics['TN'] = metrics['0'] 287 | try: 288 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 289 | except ZeroDivisionError: 290 | metrics['Precision'] = 0 291 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 292 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 293 | (metrics['Precision'] + metrics['Recall']) 294 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 295 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 296 | 297 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 298 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 299 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 300 | metrics['Loss'] = np.average(loss) 301 | 302 | return metrics 303 | 304 | 305 | def search_model(datafile_dict, test_file_list, x_dim): 306 | 307 | if not test_file_list: 308 | 
print('Test File List is Empty!') 309 | return 310 | 311 | with open('results.csvx', 'w') as result_file: 312 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 313 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 314 | headers = headers + 'loss,accuracy,F-score' 315 | result_file.write(headers + '\n') 316 | 317 | for conv_depth in [3]: 318 | for conv_filter in [4,6,8,10,12]: 319 | for conv_kernel_width in [4, 3]: 320 | for conv_pool in [2, 3, 4, 5, 6]: 321 | for lstm_units in [16,14,12,8]: 322 | for dense_depth in [1,2,3,4,5,6,7,8]: 323 | for dense_units in [16,12,8]: 324 | for dense_dropout in [0.1, 0.3]: 325 | for dense_relu_alpha in [0.1]: 326 | hyper_p_dict = { 327 | "conv_depth" : conv_depth, 328 | "conv_filter" : conv_filter, 329 | "conv_kernel_width" : conv_kernel_width, 330 | "conv_pool" : conv_pool, 331 | "lstm_units" : lstm_units, 332 | "dense_depth" : dense_depth, 333 | "dense_units" : dense_units, 334 | "dense_dropout" : dense_dropout, 335 | "dense_relu_alpha" : dense_relu_alpha 336 | } 337 | if np.random.random() < 0.3: 338 | continue 339 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 340 | result_file.flush() 341 | model = create_model(x_dim, hyper_p_dict) 342 | train_model(model, 343 | datafile_dict, 344 | x_dim=x_dim, 345 | save_model=False, 346 | verbose=0) 347 | metrics = test_model(model, test_file_list, x_dim=x_dim) 348 | result_file.write(',' + str(metrics['Loss']) + 349 | ',' + str(metrics['Accuracy']) + 350 | ',' + str(metrics['F-Score']) + '\n') 351 | result_file.flush() 352 | keras_tf_backend.clear_session() 353 | del(model) 354 | processors = int(psutil.cpu_count() / 1.5) 355 | keras_tf_backend.set_session(get_session(0.5, processors)) 356 | gc.collect() 357 | 358 | 359 | if __name__ == '__main__': 360 | 361 | processors = int(psutil.cpu_count() / 1.5) 362 | keras_tf_backend.set_session(get_session(0.5, processors)) 363 | 364 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 365 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 366 | yesterday_string = yesterday.strftime("%Y%m%d") 367 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 368 | npy_dir = "./npy/" 369 | model_dir = "./models/" 370 | pad_string = "prepad" 371 | 372 | model_category = "INV-SQL" 373 | model_type = "CDRNN-" + pad_string 374 | model_name = model_category + "-" + model_type 375 | model_filename = model_name + ".h5" 376 | 377 | model_path = os.path.join(model_dir, model_filename) 378 | model_backup_filename = model_filename + "." 
+ yesterday_string 379 | model_backup_path = os.path.join(model_dir, model_backup_filename) 380 | 381 | model = create_model(1000) 382 | 383 | payload_files = list(os.path.join(npy_dir, f) 384 | for f in os.listdir(npy_dir) 385 | if "_payload_" + pad_string in f 386 | and day_before_yesterday_string in f) 387 | 388 | sql_files = list(os.path.join(npy_dir, f) 389 | for f in os.listdir(npy_dir) 390 | if "_sql_" + pad_string in f 391 | and yesterday_string not in f) 392 | 393 | label_files = list(os.path.join(npy_dir, f) 394 | for f in os.listdir(npy_dir) 395 | if model_name in f) 396 | 397 | if os.path.exists(model_path): 398 | shutil.copy(model_path, model_backup_path) 399 | 400 | train_model(model, {'payload': payload_files, 'sql': sql_files, 'label': label_files}, 401 | x_dim=1000, save_model=True, model_name=model_path, verbose=0) 402 | 403 | 404 | 405 | 406 | -------------------------------------------------------------------------------- /inv_aut/inv_aut_train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import threading 4 | import os 5 | import shutil 6 | from copy import deepcopy 7 | 8 | import keras.backend.tensorflow_backend as keras_tf_backend 9 | import keras.utils 10 | import numpy as np 11 | import psutil 12 | import tensorflow as tf 13 | from keras.layers import Conv1D, MaxPooling1D 14 | from keras.layers import Dropout 15 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 16 | from keras.layers import LeakyReLU, BatchNormalization 17 | from keras.models import Sequential 18 | 19 | 20 | class DataSequence(keras.utils.Sequence): 21 | 22 | def _find_npy_size(self, filename): 23 | with open(filename, 'rb') as f: 24 | data = str(f.read(100)[50:]) 25 | shape_index = data.find('shape') 26 | comma_index = data[shape_index:].find(',') 27 | 28 | return int(data[shape_index+9 : shape_index+comma_index]) 29 | 30 | def _load_cache(self, label, idx): 31 | 32 | cache_end_index = self._cache_end_index_dict[label] 33 | cache_start_index = self._cache_start_index_dict[label] 34 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 35 | return 36 | 37 | threshold = self._cache_threshold_dict[label] 38 | batch_size = self._batch_size_dict[label] 39 | 40 | delete_count = int((idx - 64) * batch_size) - cache_start_index 41 | if delete_count > 0: 42 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 43 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 44 | 45 | new_row_count = self._cache_dict[label].shape[0] 46 | dict_size = self._cache_dict[label].nbytes 47 | if new_row_count > 0: 48 | self._avg_size = float(dict_size / new_row_count) 49 | 50 | remaining_size = threshold - dict_size 51 | temp_array_filename_list = [] 52 | while self._file_size_list_dict[label]: 53 | file_name = self._file_list_dict[label].pop(0) 54 | file_size = self._file_size_list_dict[label].pop(0) 55 | temp_array_filename_list.append(file_name) 56 | remaining_size = remaining_size - file_size * self._avg_size 57 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 58 | 59 | if not self._file_size_list_dict[label]: 60 | break 61 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 62 | break 63 | 64 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + 65 | list(np.load(x) for x in temp_array_filename_list) 66 | ) 67 | ) 68 | gc.collect() 69 | 70 | return 71 | 72 | def 
_initialize_objects(self): 73 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 74 | self._cache_dict = dict() 75 | gc.collect() 76 | 77 | for label in self._file_list_dict: 78 | self._file_size_list_dict[label] = [] 79 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 80 | self._cache_start_index_dict[label] = 0 81 | self._cache_end_index_dict[label] = 0 82 | for filename in self._file_list_dict[label]: 83 | npy_size = self._find_npy_size(filename) 84 | self._file_size_list_dict[label].append(npy_size) 85 | 86 | for label in self._cache_dict: 87 | self._load_cache(label, 0) 88 | 89 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 90 | 91 | self._backup_file_list_dict = deepcopy(file_list_dict) 92 | self._batch_size = batch_size 93 | self._x_dim = x_dim 94 | self._avg_size = (x_dim + 1) 95 | self._data_size = 0 96 | self._file_size_list_dict = dict() 97 | self._batch_size_dict = dict() 98 | self._cache_threshold_dict = dict() 99 | self._cache_start_index_dict = dict() 100 | self._cache_end_index_dict = dict() 101 | self._lock_dict = dict() 102 | 103 | mem = psutil.virtual_memory() 104 | buffer = 0.25 105 | 106 | total_threshold = mem.available * mem_share * (1 - buffer) 107 | 108 | for label in file_list_dict: 109 | self._batch_size_dict[label] = 0 110 | self._lock_dict[label] = threading.Lock() 111 | 112 | for filename in file_list_dict[label]: 113 | npy_size = self._find_npy_size(filename) 114 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 115 | self._data_size = self._data_size + npy_size 116 | 117 | for label in self._batch_size_dict: 118 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 119 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 120 | 121 | # Dividing with total data size / label size to get the batch size of each label 122 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 123 | 124 | self._initialize_objects() 125 | 126 | def __len__(self): 127 | return int(np.ceil(self._data_size / self._batch_size)) 128 | 129 | def __getitem__(self, idx): 130 | 131 | data = np.empty((0, self._x_dim+1)) 132 | for label in self._cache_end_index_dict: 133 | 134 | start_index = int(idx * self._batch_size_dict[label]) 135 | end_index = int((idx + 1) * self._batch_size_dict[label]) 136 | 137 | with self._lock_dict[label]: 138 | if end_index > self._cache_end_index_dict[label]: 139 | self._load_cache(label, idx) 140 | cache_start_index = start_index - self._cache_start_index_dict[label] 141 | 142 | if cache_start_index < 0: 143 | cache_start_index = 0 144 | 145 | cache_end_index = cache_start_index + (end_index - start_index) 146 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 147 | 148 | data = data[:self._batch_size] 149 | np.random.shuffle(data) 150 | 151 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 152 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 153 | 154 | train_x = (train_x - 128.0) / -128.0 155 | 156 | return (train_x, train_y) 157 | 158 | def on_epoch_end(self): 159 | self._initialize_objects() 160 | 161 | 162 | def get_session(gpu_share=0.2, threads=2): 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 164 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, 165 | intra_op_parallelism_threads=threads, 166 | 
gpu_options=gpu_options) 167 | config.gpu_options.allow_growth = True 168 | 169 | return tf.Session(config=config) 170 | 171 | 172 | def initialize_hyperparameters(parameter_dict): 173 | 174 | if parameter_dict is None: 175 | # Default Values 176 | parameter_tuple = (2, 12, 4, 5, 16, 8, 12, 0.1, 0.1) 177 | else: 178 | parameter_tuple = ( 179 | parameter_dict["conv_depth"], 180 | parameter_dict["conv_filter"], 181 | parameter_dict["conv_kernel_width"], 182 | parameter_dict["conv_pool"], 183 | parameter_dict["lstm_units"], 184 | parameter_dict["dense_depth"], 185 | parameter_dict["dense_units"], 186 | parameter_dict["dense_dropout"], 187 | parameter_dict["dense_relu_alpha"] 188 | ) 189 | return parameter_tuple 190 | 191 | 192 | def create_model(input_dim, hyperparameter_dict=None): 193 | 194 | (conv_depth, 195 | conv_filter, 196 | conv_kernel_width, 197 | conv_pool, 198 | lstm_units, 199 | dense_depth, 200 | dense_units, 201 | dense_dropout, 202 | dense_relu_alpha 203 | ) = initialize_hyperparameters(hyperparameter_dict) 204 | 205 | model = Sequential() 206 | 207 | # CNN Layer 208 | for i in range(conv_depth): 209 | conv_filter_size = conv_filter * pow(conv_pool, i) 210 | if i == 0: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu', 215 | input_shape=(input_dim, 1))) 216 | else: 217 | model.add(Conv1D(conv_filter_size, 218 | conv_kernel_width, 219 | padding='same', 220 | activation='relu')) 221 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 222 | model.add(BatchNormalization()) 223 | 224 | # RNN Layer 225 | if conv_depth > 0: 226 | (_, lstm_timesteps, lstm_features) = model.output_shape 227 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 228 | else: 229 | lstm_input_shape = (input_dim, 1) # Starts with RNN 230 | 231 | #model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=lstm_input_shape)) 232 | #model.add(Bidirectional(LSTM(lstm_units))) 233 | model.add(LSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 234 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 235 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True)) 236 | #model.add(CuDNNLSTM(lstm_units)) 237 | #model.add(CuDNNLSTM(lstm_units, input_shape=lstm_input_shape)) 238 | #model.add(Bidirectional(CuDNNLSTM(lstm_units), input_shape=lstm_input_shape)) 239 | model.add(Bidirectional(LSTM(lstm_units))) 240 | #model.add(Bidirectional(CuDNNLSTM(lstm_units))) 241 | 242 | # DNN Layer 243 | for _ in range(dense_depth): 244 | model.add(Dense(dense_units)) 245 | model.add(Dropout(dense_dropout)) 246 | model.add(LeakyReLU(dense_relu_alpha)) 247 | model.add(BatchNormalization()) 248 | 249 | # Output Layer 250 | model.add(Dense(1, activation='sigmoid')) 251 | 252 | # print(model.summary()) 253 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 254 | 255 | return model 256 | 257 | 258 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 259 | 260 | processors = int(psutil.cpu_count() / 1.5) 261 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 262 | model.fit_generator(generator=generator, 263 | epochs=5, 264 | verbose=verbose, 265 | shuffle=False, 266 | workers=processors 267 | ) 268 | 269 | if save_model: 270 | model.save(model_name) 271 | 272 | 273 | def test_model(model, test_file_list, x_dim): 274 | 275 | data = np.concatenate(tuple( 276 | 
list(np.load(filename) for filename in test_file_list) 277 | )) 278 | 279 | test_size = data.shape[0] 280 | 281 | x_test = np.array(data[:, :-1]) 282 | x_test = (x_test - 128.0) / -128.0 283 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 284 | y_test = data[:, [-1]].reshape(test_size, 1) 285 | y_prediction = model.predict(x=x_test, 286 | batch_size=4096, 287 | verbose=0) 288 | 289 | y_merged = (y_prediction.round()*2 + y_test).flatten() 290 | value, counts = np.unique(y_merged, return_counts=True) 291 | value_str = list(map(lambda x: str(int(x)), value)) 292 | metrics = dict(zip(value_str, counts)) 293 | 294 | for y_merged in ['0', '1', '2', '3']: 295 | if y_merged not in metrics: 296 | metrics[y_merged] = 0 297 | 298 | metrics['TP'] = metrics['3'] 299 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 300 | metrics['FN'] = metrics['1'] 301 | metrics['TN'] = metrics['0'] 302 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 303 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 304 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 305 | (metrics['Precision'] + metrics['Recall']) 306 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 307 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 308 | 309 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 310 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 311 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 312 | metrics['Loss'] = np.average(loss) 313 | 314 | return metrics 315 | 316 | 317 | def search_model(datafile_dict, test_file_list, x_dim): 318 | 319 | if not test_file_list: 320 | print('Test File List is Empty!') 321 | return 322 | 323 | with open('results.csvx', 'w') as result_file: 324 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 325 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 326 | headers = headers + 'loss,accuracy,F-score' 327 | result_file.write(headers + '\n') 328 | 329 | for conv_depth in [1, 2, 3]: 330 | for conv_filter in [4, 6]: 331 | for conv_kernel_width in [4, 3]: 332 | for conv_pool in [2, 3]: 333 | for lstm_units in [16, 8]: 334 | for dense_depth in [1]: 335 | for dense_units in [16, 8]: 336 | for dense_dropout in [0.1, 0.3]: 337 | for dense_relu_alpha in [0.1]: 338 | hyper_p_dict = { 339 | "conv_depth" : conv_depth, 340 | "conv_filter" : conv_filter, 341 | "conv_kernel_width" : conv_kernel_width, 342 | "conv_pool" : conv_pool, 343 | "lstm_units" : lstm_units, 344 | "dense_depth" : dense_depth, 345 | "dense_units" : dense_units, 346 | "dense_dropout" : dense_dropout, 347 | "dense_relu_alpha" : dense_relu_alpha 348 | } 349 | if np.random.random() < 0.3: 350 | continue 351 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 352 | result_file.flush() 353 | model = create_model(x_dim, hyper_p_dict) 354 | train_model(model, 355 | datafile_dict, 356 | x_dim=x_dim, 357 | save_model=False, 358 | verbose=0) 359 | metrics = test_model(model, test_file_list, x_dim=x_dim) 360 | result_file.write(',' + str(metrics['Loss']) + 361 | ',' + str(metrics['Accuracy']) + 362 | ',' + str(metrics['F-Score']) + '\n') 363 | result_file.flush() 364 | keras_tf_backend.clear_session() 365 | del(model) 366 | processors = int(psutil.cpu_count() / 1.5) 367 | keras_tf_backend.set_session(get_session(0.5, processors)) 368 | gc.collect() 369 | 
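# Daily retraining entry point below: the existing model is copied to a date-stamped backup in ./models/, the matching "_payload_prepad", "_aut_prepad" and label .npy shards are gathered from ./npy/, and a fresh 1000-byte CRDNN is trained and saved over model_path.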
370 | 371 | if __name__ == '__main__': 372 | 373 | processors = int(psutil.cpu_count() / 1.5) 374 | keras_tf_backend.set_session(get_session(0.5, processors)) 375 | 376 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 377 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 378 | yesterday_string = yesterday.strftime("%Y%m%d") 379 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 380 | npy_dir = "./npy/" 381 | model_dir = "./models/" 382 | pad_string = "prepad" 383 | 384 | model_category = "INV-AUT" 385 | model_type = "CRDNN-" + pad_string 386 | model_name = model_category + "-" + model_type 387 | model_filename = model_name + ".h5" 388 | 389 | model_path = os.path.join(model_dir, model_filename) 390 | model_backup_filename = model_filename + "." + yesterday_string 391 | model_backup_path = os.path.join(model_dir, model_backup_filename) 392 | 393 | model = create_model(1000) 394 | 395 | payload_files = list(os.path.join(npy_dir, f) 396 | for f in os.listdir(npy_dir) 397 | if "_payload_" + pad_string in f 398 | and day_before_yesterday_string in f) 399 | 400 | aut_files = list(os.path.join(npy_dir, f) 401 | for f in os.listdir(npy_dir) 402 | if "_aut_" + pad_string in f 403 | and yesterday_string not in f) 404 | 405 | label_files = list(os.path.join(npy_dir, f) 406 | for f in os.listdir(npy_dir) 407 | if model_name + "_" + pad_string in f) 408 | 409 | if os.path.exists(model_path): 410 | shutil.copy(model_path, model_backup_path) 411 | 412 | train_model(model, {'payload': payload_files, 'aut': aut_files, 'label': label_files}, 413 | x_dim=1000, save_model=True, model_name=model_path, verbose=0) 414 | 415 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import threading 4 | import os 5 | import shutil 6 | from copy import deepcopy 7 | 8 | import keras.backend.tensorflow_backend as keras_tf_backend 9 | import keras.utils 10 | import numpy as np 11 | import psutil 12 | import tensorflow as tf 13 | from keras.layers import Conv1D, MaxPooling1D 14 | from keras.layers import Dropout 15 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 16 | from keras.layers import LeakyReLU, BatchNormalization 17 | from keras.models import Sequential 18 | #from keras.utils.vis_utils import plot_model 19 | 20 | class DataSequence(keras.utils.Sequence): 21 | 22 | def _find_npy_size(self, filename): 23 | with open(filename, 'rb') as f: 24 | data = str(f.read(100)[50:]) 25 | shape_index = data.find('shape') 26 | comma_index = data[shape_index:].find(',') 27 | 28 | return int(data[shape_index+9 : shape_index+comma_index]) 29 | 30 | def _load_cache(self, label, idx): 31 | 32 | cache_end_index = self._cache_end_index_dict[label] 33 | cache_start_index = self._cache_start_index_dict[label] 34 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 35 | return 36 | 37 | threshold = self._cache_threshold_dict[label] 38 | batch_size = self._batch_size_dict[label] 39 | 40 | delete_count = int((idx - 64) * batch_size) - cache_start_index 41 | if delete_count > 0: 42 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 43 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 44 | 45 | new_row_count = self._cache_dict[label].shape[0] 46 | dict_size = self._cache_dict[label].nbytes 47 | if new_row_count 
> 0: 48 | self._avg_size = float(dict_size / new_row_count) 49 | 50 | remaining_size = threshold - dict_size 51 | temp_array_filename_list = [] 52 | while self._file_size_list_dict[label]: 53 | file_name = self._file_list_dict[label].pop(0) 54 | file_size = self._file_size_list_dict[label].pop(0) 55 | temp_array_filename_list.append(file_name) 56 | remaining_size = remaining_size - file_size * self._avg_size 57 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 58 | 59 | if not self._file_size_list_dict[label]: 60 | break 61 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 62 | break 63 | 64 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + 65 | list(np.load(x) for x in temp_array_filename_list) 66 | ) 67 | ) 68 | gc.collect() 69 | 70 | return 71 | 72 | def _initialize_objects(self): 73 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 74 | self._cache_dict = dict() 75 | gc.collect() 76 | 77 | for label in self._file_list_dict: 78 | self._file_size_list_dict[label] = [] 79 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 80 | self._cache_start_index_dict[label] = 0 81 | self._cache_end_index_dict[label] = 0 82 | for filename in self._file_list_dict[label]: 83 | npy_size = self._find_npy_size(filename) 84 | self._file_size_list_dict[label].append(npy_size) 85 | 86 | for label in self._cache_dict: 87 | self._load_cache(label, 0) 88 | 89 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 90 | 91 | self._backup_file_list_dict = deepcopy(file_list_dict) 92 | self._batch_size = batch_size 93 | self._x_dim = x_dim 94 | self._avg_size = (x_dim + 1) 95 | self._data_size = 0 96 | self._file_size_list_dict = dict() 97 | self._batch_size_dict = dict() 98 | self._cache_threshold_dict = dict() 99 | self._cache_start_index_dict = dict() 100 | self._cache_end_index_dict = dict() 101 | self._lock_dict = dict() 102 | 103 | mem = psutil.virtual_memory() 104 | buffer = 0.25 105 | 106 | total_threshold = mem.available * mem_share * (1 - buffer) 107 | 108 | for label in file_list_dict: 109 | self._batch_size_dict[label] = 0 110 | self._lock_dict[label] = threading.Lock() 111 | 112 | for filename in file_list_dict[label]: 113 | npy_size = self._find_npy_size(filename) 114 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 115 | self._data_size = self._data_size + npy_size 116 | 117 | for label in self._batch_size_dict: 118 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 
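# Each label's in-memory cache budget (next line) and its per-batch row quota are set proportionally to that label's share of the total row count.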
119 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 120 | 121 | # Dividing with total data size / label size to get the batch size of each label 122 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 123 | 124 | self._initialize_objects() 125 | 126 | def __len__(self): 127 | return int(np.ceil(self._data_size / self._batch_size)) 128 | 129 | def __getitem__(self, idx): 130 | 131 | data = np.empty((0, self._x_dim+1)) 132 | for label in self._cache_end_index_dict: 133 | 134 | start_index = int(idx * self._batch_size_dict[label]) 135 | end_index = int((idx + 1) * self._batch_size_dict[label]) 136 | 137 | with self._lock_dict[label]: 138 | if end_index > self._cache_end_index_dict[label]: 139 | self._load_cache(label, idx) 140 | cache_start_index = start_index - self._cache_start_index_dict[label] 141 | 142 | if cache_start_index < 0: 143 | cache_start_index = 0 144 | 145 | cache_end_index = cache_start_index + (end_index - start_index) 146 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 147 | 148 | data = data[:self._batch_size] 149 | np.random.shuffle(data) 150 | 151 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 152 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 153 | 154 | train_x = (train_x - 128.0) / -128.0 155 | 156 | return (train_x, train_y) 157 | 158 | def on_epoch_end(self): 159 | self._initialize_objects() 160 | 161 | 162 | def get_session(gpu_share=0.2, threads=2): 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 164 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, 165 | intra_op_parallelism_threads=threads, 166 | gpu_options=gpu_options) 167 | config.gpu_options.allow_growth = True 168 | 169 | return tf.Session(config=config) 170 | 171 | 172 | def initialize_hyperparameters(parameter_dict): 173 | 174 | if parameter_dict is None: 175 | # Default Values 176 | parameter_tuple = (2, 12, 4, 5, 16, 8, 12, 0.1, 0.1) 177 | else: 178 | parameter_tuple = ( 179 | parameter_dict["conv_depth"], 180 | parameter_dict["conv_filter"], 181 | parameter_dict["conv_kernel_width"], 182 | parameter_dict["conv_pool"], 183 | parameter_dict["lstm_units"], 184 | parameter_dict["dense_depth"], 185 | parameter_dict["dense_units"], 186 | parameter_dict["dense_dropout"], 187 | parameter_dict["dense_relu_alpha"] 188 | ) 189 | return parameter_tuple 190 | 191 | 192 | def create_model(input_dim, hyperparameter_dict=None): 193 | 194 | (conv_depth, 195 | conv_filter, 196 | conv_kernel_width, 197 | conv_pool, 198 | lstm_units, 199 | dense_depth, 200 | dense_units, 201 | dense_dropout, 202 | dense_relu_alpha 203 | ) = initialize_hyperparameters(hyperparameter_dict) 204 | 205 | model = Sequential() 206 | 207 | # CNN Layer 208 | for i in range(conv_depth): 209 | conv_filter_size = conv_filter * pow(conv_pool, i) 210 | if i == 0: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu', 215 | input_shape=(input_dim, 1))) 216 | else: 217 | model.add(Conv1D(conv_filter_size, 218 | conv_kernel_width, 219 | padding='same', 220 | activation='relu')) 221 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 222 | model.add(BatchNormalization()) 223 | 224 | # RNN Layer 225 | if conv_depth > 0: 226 | (_, lstm_timesteps, lstm_features) = model.output_shape 227 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 228 | else: 
229 | lstm_input_shape = (input_dim, 1) # Starts with RNN 230 | 231 | #model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=lstm_input_shape)) 232 | #model.add(Bidirectional(LSTM(lstm_units))) 233 | model.add(LSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 234 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 235 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True)) 236 | #model.add(CuDNNLSTM(lstm_units)) 237 | #model.add(CuDNNLSTM(lstm_units, input_shape=lstm_input_shape)) 238 | #model.add(Bidirectional(CuDNNLSTM(lstm_units), input_shape=lstm_input_shape)) 239 | model.add(Bidirectional(LSTM(lstm_units))) 240 | #model.add(Bidirectional(CuDNNLSTM(lstm_units))) 241 | 242 | # DNN Layer 243 | for _ in range(dense_depth): 244 | model.add(Dense(dense_units)) 245 | model.add(Dropout(dense_dropout)) 246 | model.add(LeakyReLU(dense_relu_alpha)) 247 | model.add(BatchNormalization()) 248 | 249 | # Output Layer 250 | model.add(Dense(1, activation='sigmoid')) 251 | 252 | print(model.summary()) 253 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 254 | #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True) 255 | return model 256 | 257 | 258 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 259 | 260 | processors = int(psutil.cpu_count() / 1.5) 261 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 262 | model.fit_generator(generator=generator, 263 | epochs=5, 264 | verbose=verbose, 265 | shuffle=False, 266 | workers=processors 267 | ) 268 | 269 | if save_model: 270 | model.save(model_name) 271 | 272 | 273 | def test_model(model, test_file_list, x_dim): 274 | 275 | data = np.concatenate(tuple( 276 | list(np.load(filename) for filename in test_file_list) 277 | )) 278 | 279 | test_size = data.shape[0] 280 | 281 | x_test = np.array(data[:, :-1]) 282 | x_test = (x_test - 128.0) / -128.0 283 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 284 | y_test = data[:, [-1]].reshape(test_size, 1) 285 | y_prediction = model.predict(x=x_test, 286 | batch_size=4096, 287 | verbose=0) 288 | 289 | y_merged = (y_prediction.round()*2 + y_test).flatten() 290 | value, counts = np.unique(y_merged, return_counts=True) 291 | value_str = list(map(lambda x: str(int(x)), value)) 292 | metrics = dict(zip(value_str, counts)) 293 | 294 | for y_merged in ['0', '1', '2', '3']: 295 | if y_merged not in metrics: 296 | metrics[y_merged] = 0 297 | 298 | metrics['TP'] = metrics['3'] 299 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 300 | metrics['FN'] = metrics['1'] 301 | metrics['TN'] = metrics['0'] 302 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 303 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 304 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 305 | (metrics['Precision'] + metrics['Recall']) 306 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 307 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 308 | 309 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 310 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 311 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 312 | metrics['Loss'] = np.average(loss) 313 | 314 | return metrics 315 | 316 | 317 | 
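# search_model below sweeps a fixed grid of CNN/LSTM/DNN hyperparameters, randomly skipping roughly 30% of the combinations; each remaining candidate is trained, scored on the held-out test files, and its loss/accuracy/F-score is appended to results.csvx before the Keras/TF session is reset.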
def search_model(datafile_dict, test_file_list, x_dim): 318 | 319 | if not test_file_list: 320 | print('Test File List is Empty!') 321 | return 322 | 323 | with open('results.csvx', 'w') as result_file: 324 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 325 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 326 | headers = headers + 'loss,accuracy,F-score' 327 | result_file.write(headers + '\n') 328 | 329 | for conv_depth in [1, 2, 3]: 330 | for conv_filter in [4, 6]: 331 | for conv_kernel_width in [4, 3]: 332 | for conv_pool in [2, 3]: 333 | for lstm_units in [16, 8]: 334 | for dense_depth in [1]: 335 | for dense_units in [16, 8]: 336 | for dense_dropout in [0.1, 0.3]: 337 | for dense_relu_alpha in [0.1]: 338 | hyper_p_dict = { 339 | "conv_depth" : conv_depth, 340 | "conv_filter" : conv_filter, 341 | "conv_kernel_width" : conv_kernel_width, 342 | "conv_pool" : conv_pool, 343 | "lstm_units" : lstm_units, 344 | "dense_depth" : dense_depth, 345 | "dense_units" : dense_units, 346 | "dense_dropout" : dense_dropout, 347 | "dense_relu_alpha" : dense_relu_alpha 348 | } 349 | if np.random.random() < 0.3: 350 | continue 351 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 352 | result_file.flush() 353 | model = create_model(x_dim, hyper_p_dict) 354 | train_model(model, 355 | datafile_dict, 356 | x_dim=x_dim, 357 | save_model=False, 358 | verbose=0) 359 | metrics = test_model(model, test_file_list, x_dim=x_dim) 360 | result_file.write(',' + str(metrics['Loss']) + 361 | ',' + str(metrics['Accuracy']) + 362 | ',' + str(metrics['F-Score']) + '\n') 363 | result_file.flush() 364 | keras_tf_backend.clear_session() 365 | del(model) 366 | processors = int(psutil.cpu_count() / 1.5) 367 | keras_tf_backend.set_session(get_session(0.5, processors)) 368 | gc.collect() 369 | 370 | 371 | if __name__ == '__main__': 372 | 373 | processors = int(psutil.cpu_count() / 1.5) 374 | keras_tf_backend.set_session(get_session(0.5, processors)) 375 | 376 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 377 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 378 | yesterday_string = yesterday.strftime("%Y%m%d") 379 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 380 | npy_dir = "./npy/" 381 | model_dir = "./models/" 382 | pad_string = "prepad" 383 | 384 | model_category = "INV-APP" 385 | model_type = "CRDNN-" + pad_string 386 | model_name = model_category + "-" + model_type 387 | model_filename = model_name + ".h5" 388 | 389 | model_path = os.path.join(model_dir, model_filename) 390 | model_backup_filename = model_filename + "." 
+ yesterday_string 391 | model_backup_path = os.path.join(model_dir, model_backup_filename) 392 | 393 | model = create_model(1000) 394 | 395 | payload_files = list(os.path.join(npy_dir, f) 396 | for f in os.listdir(npy_dir) 397 | if "_payload_" + pad_string in f 398 | and day_before_yesterday_string in f) 399 | 400 | app_files = list(os.path.join(npy_dir, f) 401 | for f in os.listdir(npy_dir) 402 | if "_app_" + pad_string in f 403 | and yesterday_string not in f) 404 | 405 | label_files = list(os.path.join(npy_dir, f) 406 | for f in os.listdir(npy_dir) 407 | if model_name + "_" + pad_string in f) 408 | 409 | if os.path.exists(model_path): 410 | shutil.copy(model_path, model_backup_path) 411 | 412 | # Save to model_path so the model lands in ./models/ alongside its dated backups, as in the other training scripts. 413 | train_model(model, {'payload': payload_files, 'app': app_files, 'label': label_files}, x_dim=1000, save_model=True, model_name=model_path, verbose=0) 414 | 415 | 416 | #test_model(model, test_file_list, x_dim=1000) 417 | 418 | 419 | #search_model({'payload': payload_files, 'app': app_files, 'label': label_files}, test_file_list, x_dim=1000) 420 | 421 | 422 | 423 | 424 | --------------------------------------------------------------------------------
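The TP/FP/FN/TN bookkeeping in test_model above encodes each (prediction, label) pair as a single integer, round(prediction) * 2 + label, so 3 maps to TP, 2 to FP, 1 to FN and 0 to TN. A minimal standalone sketch of that encoding, using illustrative arrays rather than repository data:

import numpy as np

# Illustrative sigmoid outputs and binary labels (not from the repository).
y_prediction = np.array([0.9, 0.8, 0.2, 0.1])
y_test = np.array([1.0, 0.0, 1.0, 0.0])

# Same scheme as test_model: round(prediction) * 2 + label -> 3=TP, 2=FP, 1=FN, 0=TN.
y_merged = (y_prediction.round() * 2 + y_test).flatten()
values, counts = np.unique(y_merged, return_counts=True)
metrics = dict(zip((str(int(v)) for v in values), counts))
for key in ['0', '1', '2', '3']:
    metrics.setdefault(key, 0)

print({'TP': int(metrics['3']), 'FP': int(metrics['2']),
       'FN': int(metrics['1']), 'TN': int(metrics['0'])})
# {'TP': 1, 'FP': 1, 'FN': 1, 'TN': 1}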