├── inv_app ├── __init__.py ├── build.sh ├── inv_app_crd_parse.Dockerfile ├── inv_app_crd_train.Dockerfile ├── inv_app_crd_parse.py └── inv_app_crd_train.py ├── inv_aut ├── __init__.py ├── build.sh ├── inv_aut_train.Dockerfile ├── inv_aut_parse.Dockerfile ├── inv_aut_parse.py └── inv_aut_train.py ├── clean_image.sh ├── data_split.Dockerfile ├── inv_sql ├── inv_sql_train.Dockerfile ├── build.sh ├── inv_sql_parse.Dockerfile ├── inv_sql_parse.py └── inv_sql_train.py ├── label_save.Dockerfile ├── data_save.Dockerfile ├── realtime_save.Dockerfile ├── data_backup.Dockerfile ├── data_backup.py ├── LICENSE ├── data_split.py ├── README.md ├── fsi_splunk.py ├── splunk_queries.py └── data_save.py /inv_app/__init__.py: -------------------------------------------------------------------------------- 1 | import data_parse, data_save -------------------------------------------------------------------------------- /inv_aut/__init__.py: -------------------------------------------------------------------------------- 1 | import data_parse, data_save -------------------------------------------------------------------------------- /clean_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker rmi $(docker images -f dangling=true -q) -------------------------------------------------------------------------------- /inv_app/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_app_crd_parse.Dockerfile -t aisec:inv_app_crd_parse . 3 | docker build -f inv_app_crd_train.Dockerfile -t aisec:inv_app_crd_train . -------------------------------------------------------------------------------- /data_split.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_split.py /home/dockeruser/data_split.py 6 | 7 | CMD python3 data_split.py 8 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | 3 | RUN mkdir /home/dockeruser/npy 4 | RUN mkdir /home/dockeruser/models 5 | 6 | COPY inv_sql_train.py /home/dockeruser/train.py 7 | 8 | CMD python3 train.py 9 | -------------------------------------------------------------------------------- /inv_sql/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_sql_parse.Dockerfile -t aisec:crd_inv_sql_parse . 3 | docker build -f inv_sql_train.Dockerfile -t aisec:crd_inv_sql_train . 4 | docker build -f inv_sql_predict.Dockerfile -t aisec:crd_inv_sql_predict . 5 | -------------------------------------------------------------------------------- /inv_aut/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -f inv_aut_parse.Dockerfile -t aisec:crd_inv_aut_parse . 3 | docker build -f inv_aut_train.Dockerfile -t aisec:crd_inv_aut_train . 4 | docker build -f inv_aut_predict.Dockerfile -t aisec:crd_inv_aut_predict . 
5 | 6 | -------------------------------------------------------------------------------- /inv_aut/inv_aut_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/npy 5 | RUN mkdir /home/dockeruser/models 6 | 7 | COPY inv_aut_train.py /home/dockeruser/train.py 8 | 9 | CMD python3 train.py 10 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER HyeSeong Jeong 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_sql_parse.py /home/dockeruser/inv_sql_parse.py 8 | 9 | CMD python3 inv_sql_parse.py -------------------------------------------------------------------------------- /inv_aut/inv_aut_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_aut_parse.py /home/dockeruser/inv_aut_parse.py 8 | 9 | CMD python3 inv_aut_parse.py 10 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_parse.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Aechan Kim 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/npy 6 | 7 | COPY inv_app_crd_parse.py /home/dockeruser/inv_app_crd_parse.py 8 | 9 | CMD python3 inv_app_crd_parse.py 10 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_train.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-keras 2 | MAINTAINER Aechan Kim 3 | 4 | RUN mkdir /home/dockeruser/npy 5 | RUN mkdir /home/dockeruser/models 6 | 7 | COPY inv_app_crd_train.py /home/dockeruser/inv_app_crd_train.py 8 | 9 | CMD python3 inv_app_crd_train.py 10 | -------------------------------------------------------------------------------- /label_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py label 10 | -------------------------------------------------------------------------------- /data_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py payload 10 | -------------------------------------------------------------------------------- /realtime_save.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | COPY data_save.py /home/dockeruser/data_save.py 6 | COPY fsi_splunk.py /home/dockeruser/fsi_splunk.py 7 | 
COPY splunk_queries.py /home/dockeruser/splunk_queries.py 8 | 9 | CMD python3 data_save.py realtime 10 | -------------------------------------------------------------------------------- /data_backup.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ai:python-common 2 | MAINTAINER Mohyun Park 3 | 4 | RUN mkdir /home/dockeruser/data 5 | RUN mkdir /home/dockeruser/prediction 6 | RUN mkdir /home/dockeruser/data_backup 7 | RUN mkdir /home/dockeruser/prediction_backup 8 | COPY data_backup.py /home/dockeruser/data_backup.py 9 | 10 | CMD python3 data_backup.py 11 | -------------------------------------------------------------------------------- /data_backup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import shutil 4 | import datetime 5 | 6 | 7 | def backup_data(data_dir, backup_dir, include_string, keep_days=7): 8 | 9 | past_day = datetime.datetime.today() + datetime.timedelta(days=-keep_days) 10 | past_day_string = past_day.strftime("%Y%m%d") 11 | 12 | for filename in os.listdir(data_dir): 13 | if past_day_string in filename and include_string in filename: 14 | from_path = os.path.join(data_dir, filename) 15 | to_path = os.path.join(backup_dir, filename) 16 | shutil.move(from_path, to_path) 17 | 18 | 19 | if __name__ == "__main__": 20 | backup_data("./data", "./data_backup", "total", keep_days=3) 21 | backup_data("./data", "./data_backup", "payload", keep_days=3) 22 | backup_data("./data", "./data_backup", "aut", keep_days=28) 23 | backup_data("./data", "./data_backup", "app", keep_days=28) 24 | backup_data("./data", "./data_backup", "sql", keep_days=28) 25 | backup_data("./prediction", "./prediction_backup", "", keep_days=7) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Financial Security Institute (FSI). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /data_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import datetime 4 | 5 | 6 | def classify_payload(datafile_list, category_list): 7 | 8 | category_dict = { 9 | "payload" : "", 10 | "app" : "응용프로그램 취약점 공격", 11 | "aut" : "패스워드 추측 및 인증우회 공격", 12 | "sql" : "SQL Injection 공격" 13 | } 14 | 15 | for filename in datafile_list: 16 | dirname = os.path.dirname(filename) 17 | ymd = str(os.path.basename(filename).split('_')[0]) 18 | df = pd.read_csv(filename, header=0) 19 | df = df.fillna(value={"MID_CATE_NM": ""}) 20 | for category in category_list: 21 | new_filename = os.path.join(dirname, ymd + "_" + category + ".csvx") 22 | df[df["MID_CATE_NM"]==category_dict[category]].to_csv( 23 | new_filename, 24 | index=False, 25 | encoding="utf-8" 26 | ) 27 | os.umask(0) 28 | os.chmod(new_filename, 0o666) 29 | os.umask(0o027) 30 | 31 | 32 | def classify_label(labelfile_list): 33 | for filename in labelfile_list: 34 | dirname = os.path.dirname(filename) 35 | ymd = str(os.path.basename(filename).split('_')[0]) 36 | df = pd.read_csv(filename, header=0) 37 | for model_name in df.model_name.unique(): 38 | new_filename = os.path.join(dirname, ymd + "_" + model_name + ".csvx") 39 | df[df["model_name"]==model_name].to_csv( 40 | new_filename, 41 | index=False, 42 | encoding="utf-8" 43 | ) 44 | os.umask(0) 45 | os.chmod(new_filename, 0o666) 46 | os.umask(0o027) 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | mid_category_list = ["payload", "app", "aut", "sql"] 52 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 53 | day_string = day_before_yesterday.strftime("%Y%m%d") 54 | data_dir = "./data" 55 | file_list = list(os.path.join(data_dir, f) 56 | for f in os.listdir(data_dir) 57 | if day_string + "_total" in f) 58 | 59 | label_list = list(os.path.join(data_dir, f) 60 | for f in os.listdir(data_dir) 61 | if day_string + "_label" in f) 62 | 63 | classify_payload(file_list, mid_category_list) 64 | classify_label(label_list) 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # License 2 | Our AI-IDS software is licensed under the MIT License; the full license text and notice are included in the LICENSE file. 3 | 4 | # AI-IDS 5 | AI-IDS: Application of Deep Learning to Real-time Web Intrusion Detection 6 | 7 | We implemented our Artificial Intelligence-based Intrusion Detection System (AI-IDS) and applied it to real-time web traffic to distinguish sophisticated attacks, such as unknown patterns, encoded payloads, or obfuscated attacks, from benign traffic. It also helps with writing and improving Snort rules based on newly identified patterns. AI-IDS is a flexible and scalable system implemented as a set of Docker images, with user-defined functions separated into independent images. We designed a CNN-LSTM model structure based on normalized UTF-8 encoding for big-data-scale web traffic. 8 | 9 | # payload_analysis 10 | This is the AI-IDS software that can run in a Splunk environment. 11 | 12 | More details: 13 | The paper "AI-IDS: Application of Deep Learning to Real-time Web Intrusion Detection" has been published in IEEE Access, Vol. 8, 2020.
14 | doi:10.1109/ACCESS.2020.2986882 15 | Authors: Aechan Kim, Mohyun Park, DongHoon Lee 16 | 17 | 18 | # Bio 19 | Aechan Kim (ackim@fsec.or.kr) is an assistant manager in Financial Security Institute (FSI), Yongin, South Korea. He received the B.S. degree in Industrial Engineering from Seoul National University of Science and Technology, Seoul, South Korea, in 2009, and the M.S. degree in financial information security from Korea University, Seoul, in 2014, where he is currently pursuing the Ph.D. degree in Graduate School of Information Security. 20 | 21 | Mohyun Park (mhpark@fsec.or.kr) is a manager in Financial Security Institute (FSI), Yongin, South Korea. He received the B.S. degree in Computer Science from Seoul National University, Seoul, South Korea, in 2013. 22 | 23 | Dong Hoon Lee (donghlee@korea.ac.kr) received the B.S. degree from the Department of Economics, Korea University, Seoul, in 1985, and the M.S. and Ph.D. degrees in computer science from The University of Oklahoma, Norman, in 1988 and 1992, respectively. Since 1993, he has been with the Faculty of Computer Science and Information Security, Korea University. He is currently a Professor and the Director of the Graduate School of Information Security, Korea University. 24 | 25 | # Acknowledgments 26 | This research was supported by Financial Security Institute (FSI), South Korea. 27 | -------------------------------------------------------------------------------- /fsi_splunk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | import time 4 | import re 5 | import xml.etree.ElementTree as et 6 | 7 | 8 | ######################################################### 9 | ### Splunk Query Setup # 10 | ### search_query (string) : Query # 11 | ### period (int) : duration for task completion (sec) # 12 | ### output_format (string) : outputfile format (csv, xml, json) # 13 | ### auth (string tuple) : (ID, PW) # 14 | ### output_count (int) : Number of ouput count # 15 | ### output file format : csv, xml, json # 16 | ### return (string) : result of query # 17 | ######################################################### 18 | def query(splunk_host, search_query, check_frequency, output_format, auth, sample_ratio=1, output_count=0): 19 | 20 | if not search_query.startswith('|'): 21 | 22 | if 'latest' not in search_query: 23 | search_query = 'latest=now ' + search_query 24 | 25 | if 'earliest' not in search_query: 26 | search_query = 'earliest=-15m@m ' + search_query 27 | 28 | if not search_query.startswith('search'): 29 | search_query = 'search ' + search_query 30 | 31 | 32 | if output_format not in ['csv', 'xml','json']: 33 | return '' 34 | 35 | splunk_job_url = splunk_host + "/services/search/jobs" 36 | search_response = requests.post(splunk_job_url, 37 | data = {'search':search_query, 38 | 'dispatch.sample_ratio':sample_ratio}, 39 | auth = auth, 40 | verify=False) 41 | # Job has been submitted. 
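    # The POST above creates an asynchronous search job; Splunk's REST API replies with
    # an XML document whose <sid> element identifies the job. The code below extracts
    # that SID, polls the job endpoint until its dispatchState reaches DONE (or FAILED),
    # and then downloads the results in the requested output_mode (csv, xml or json).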
42 | try: 43 | search_root = et.fromstring(search_response.text) 44 | splunk_sid = search_root.find('sid').text 45 | except AttributeError: 46 | print(search_response.text) 47 | exit(0) 48 | return None 49 | except et.ParseError: 50 | print(search_response.text) 51 | exit(0) 52 | 53 | while True: 54 | time.sleep(check_frequency) 55 | job_response = requests.get(splunk_job_url + '/' + splunk_sid, 56 | auth = auth, 57 | verify=False) 58 | 59 | job_status = re.search('(.+)', job_response.text).group(1) 60 | 61 | #job_root = et.fromstring(job_response.text) 62 | #ns = {'atom': 'http://www.w3.org/2005/Atom', 's': 'http://dev.splunk.com/ns/rest'} 63 | #job_status = job_root.find("./atom:content/s:dict/s:key[@name='dispatchState']", ns).text 64 | 65 | if job_status == 'DONE': # Job is finished 66 | break 67 | if job_status == 'FAILED': # Job is finished 68 | print('Search Failed!') 69 | print(job_response.text) 70 | exit(0) 71 | 72 | 73 | splunk_result = requests.get(splunk_job_url + '/' + splunk_sid 74 | + '/results?output_mode=' + output_format 75 | + '&count=' + str(output_count), 76 | auth = auth, 77 | verify=False) 78 | 79 | if ' 0: 116 | latest_notable = 0 117 | args = { 118 | "earliest_minute": str(earliest_minute), 119 | "latest_minute": str(latest_minute), 120 | "latest_notable": str(latest_notable), 121 | "top_category": top_category, 122 | "mid_category": mid_category 123 | } 124 | query = """ 125 | 126 | """ 127 | query = query.format(**args) 128 | 129 | return query 130 | 131 | 132 | def search_query_payload_test(earliest_minute=-1450, latest_minute=-1440): 133 | 134 | args = { 135 | "earliest_minute": str(earliest_minute), 136 | "latest_minute": str(latest_minute) 137 | } 138 | 139 | query = """ 140 | 141 | """ 142 | query = query.format(**args) 143 | 144 | return query 145 | 146 | -------------------------------------------------------------------------------- /data_save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import datetime 4 | import sys 5 | import shutil 6 | 7 | 8 | def append_data(data_dir, data_type, splunk_query, credentials, headers, data_count=0, sample_ratio=1): 9 | 10 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 11 | 12 | yesterday_string = yesterday.strftime("%Y%m%d") 13 | new_filename = data_dir + yesterday_string + "_" + data_type + ".csvx" 14 | 15 | if not os.path.exists(new_filename): 16 | os.umask(0) 17 | with open(os.open(new_filename, os.O_CREAT | os.O_WRONLY, 0o666), 'a') as f: 18 | f.write(','.join(headers) + '\n') 19 | 20 | # 전일 데이터 입력 21 | splunk_data = None 22 | while splunk_data is None: 23 | splunk_data = fsi_splunk.query( 24 | "http://192.168.143.39:8089", 25 | splunk_query, 26 | check_frequency=10, 27 | output_format='csv', 28 | auth=credentials, 29 | sample_ratio=sample_ratio, 30 | output_count=data_count 31 | ) 32 | 33 | if len(splunk_data) > 0: 34 | with open(new_filename, 35 | mode='a', 36 | encoding='utf-8', 37 | newline='') as f: 38 | splunk_data = splunk_data[splunk_data.index('\n'):] 39 | f.write(splunk_data + "\n") 40 | 41 | 42 | def replace_data(data_path, splunk_query, credentials, headers, data_count=0, sample_ratio=1): 43 | 44 | splunk_data = None 45 | while splunk_data is None: 46 | splunk_data = fsi_splunk.query( 47 | "http://192.168.143.39:8089", 48 | splunk_query, 49 | check_frequency=10, 50 | output_format='csv', 51 | auth=credentials, 52 | output_count=data_count 53 | ) 54 | 55 | with open(data_path, 56 | mode='w', 57 | 
encoding='utf-8', 58 | newline='') as f: 59 | f.write(splunk_data + '\n') 60 | 61 | 62 | if __name__ == "__main__": 63 | import fsi_splunk 64 | from splunk_queries import search_query_total, search_query_label, search_query_payload 65 | 66 | credentials = ('airesearch', 'airflow!@') 67 | 68 | if sys.argv[-1] == "payload": 69 | 70 | payload_headers = [ 71 | "_time", 72 | "src_ip", 73 | "src_port", 74 | "dest_ip", 75 | "dest_port", 76 | "src_content", 77 | "TOP_CATE_NM", 78 | "MID_CATE_NM", 79 | "suppression", 80 | "desc", 81 | "drill", 82 | "msg", 83 | "label" 84 | ] 85 | 86 | append_data( 87 | "./data/", 88 | "total", 89 | search_query_total(headers=payload_headers), 90 | credentials=credentials, 91 | headers=payload_headers, 92 | data_count=500000, 93 | ) 94 | 95 | elif sys.argv[-1] == "label": 96 | 97 | label_headers = [ 98 | "_time", 99 | "src_ip", 100 | "src_port", 101 | "dest_ip", 102 | "dest_port", 103 | "src_content", 104 | "model_name", 105 | "label", 106 | "comment" 107 | ] 108 | 109 | append_data( 110 | "./data/", 111 | "label", 112 | search_query_label(earliest_minute=-1500, latest_minute=-1440, headers=label_headers), 113 | credentials=credentials, 114 | headers=label_headers, 115 | data_count=100000 116 | ) 117 | 118 | elif sys.argv[-1] == "realtime": 119 | 120 | realtime_headers = [ 121 | "_time", 122 | "tas", 123 | "src_ip", 124 | "src_port", 125 | "dest_ip", 126 | "dest_port", 127 | "src_content", 128 | ] 129 | 130 | data_dir = "./data/" 131 | holder_filename = "payload_holder.tmp" 132 | payload_filename = "payload_data.tmp" 133 | 134 | holder_path = os.path.join(data_dir, holder_filename) 135 | payload_path = os.path.join(data_dir, payload_filename) 136 | 137 | replace_data( 138 | holder_path, 139 | search_query_payload(earliest_minute=-200, latest_minute=-180, headers=realtime_headers), 140 | credentials=credentials, 141 | headers=realtime_headers, 142 | data_count=1000000 143 | ) 144 | shutil.copy(holder_path, payload_path) 145 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | def convert_content(content_string, x_dim, pad_before=True): 15 | 16 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 17 | if len(int_list) < x_dim: 18 | if pad_before: 19 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 20 | else: 21 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 22 | 23 | return int_list 24 | 25 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 26 | 27 | try: 28 | dataframe = pandas.read_csv(filename, 29 | header=0, 30 | usecols=["src_content", "label"], 31 | skiprows=list(range(1, start_index)), 32 | nrows=batch_size, 33 | engine='python') 34 | labels = dataframe["label"].values.astype(np.uint8) 35 | except ValueError: 36 | dataframe = pandas.read_csv(filename, 37 | header=0, 38 | usecols=["src_content"], 39 | skiprows=list(range(1, start_index)), 40 | nrows=batch_size, 41 | engine='python') 42 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 43 | 44 | labels = labels.reshape((labels.shape[0], 1)) 45 | src_content = list(convert_content(x, 
x_dim=x_dim, pad_before=pad_before) 46 | for x in dataframe["src_content"].values) 47 | 48 | src_content_aug = src_content 49 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 50 | 51 | for i in range(1, augmentation): 52 | if pad_before: 53 | src_content_aug = src_content_aug + list( 54 | [np.uint8(0)]*i + content[:-i] for content in src_content 55 | ) 56 | else: 57 | src_content_aug = src_content_aug + list( 58 | content[:-i] + [np.uint8(0)] * i for content in src_content 59 | ) 60 | 61 | src_content_aug = np.array(src_content_aug) 62 | file_no = int(start_index / batch_size) 63 | if pad_before: 64 | pad_string = '_prepad' 65 | else: 66 | pad_string = '_postpad' 67 | 68 | basename = os.path.basename(filename) 69 | file_extension_index = basename.rfind('.') 70 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 71 | save_filename = os.path.join(npy_dir, save_basename) 72 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 73 | gc.collect() 74 | 75 | return 76 | 77 | 78 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 79 | 80 | processors = int(multiprocessing.cpu_count() / 1.5) 81 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 82 | 83 | for filepath in datafile_list: 84 | if pad_before: 85 | pad_string = '_prepad' 86 | else: 87 | pad_string = '_postpad' 88 | 89 | filename = os.path.basename(filepath) 90 | file_extension_index = filename.rfind('.') 91 | npy_filename = filename[:file_extension_index] + pad_string + '_0.npy' 92 | 93 | if npy_filename in os.listdir(npy_dir): 94 | continue 95 | 96 | try: 97 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 98 | except EmptyDataError: 99 | continue 100 | 101 | row_count = df_temp.shape[0] 102 | del(df_temp) 103 | gc.collect() 104 | 105 | pool = multiprocessing.Pool(processes=processors) 106 | 107 | split_size = int(np.ceil(row_count / line_per_processor)) 108 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 109 | 110 | pool.map(partial(convert_data, 111 | filename=filepath, npy_dir=npy_dir, 112 | batch_size=line_per_processor, 113 | x_dim=x_dim, 114 | pad_before=pad_before, 115 | augmentation=augmentation 116 | ), 117 | index_list) 118 | 119 | pool.close() 120 | pool.join() 121 | gc.collect() 122 | 123 | 124 | if __name__ == "__main__": 125 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 126 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 127 | yesterday_string = yesterday.strftime("%Y%m%d") 128 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 129 | data_dir = "./data/" 130 | npy_dir = "./npy/" 131 | 132 | payload_file_list = list(os.path.join(data_dir, f) 133 | for f in os.listdir(data_dir) 134 | if "payload" in f and day_before_yesterday_string in f) 135 | 136 | sql_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "sql" in f and yesterday_string not in f) 139 | 140 | label_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "INV-SQL" in f and yesterday_string not in f) 143 | 144 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 145 | convert_file_list(sql_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 146 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 147 | 
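A minimal sketch of the byte-level encoding that inv_sql_parse.py (and its inv_aut/inv_app siblings) apply to each payload: the string is UTF-8 encoded, truncated to x_dim bytes, zero-padded before or after, and the label is appended as the last column of the saved .npy array. The encode_payload function and the example request string below are illustrative only and are not part of the original scripts.

import numpy as np

def encode_payload(content_string, x_dim=1000, pad_before=True):
    # UTF-8 encode, truncate to x_dim bytes, then zero-pad to a fixed length.
    byte_list = list(str(content_string).encode('utf-8'))[:x_dim]
    padding = [0] * (x_dim - len(byte_list))
    padded = padding + byte_list if pad_before else byte_list + padding
    return np.array(padded, dtype=np.uint8)

# Example (hypothetical request string): a (50,) uint8 vector with leading zeros
# followed by the raw UTF-8 byte values of the payload.
vector = encode_payload("GET /index.php?id=1%20OR%201=1", x_dim=50)
print(vector.shape, vector[-10:])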
-------------------------------------------------------------------------------- /inv_aut/inv_aut_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | 15 | def convert_content(content_string, x_dim, pad_before=True): 16 | 17 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 18 | if len(int_list) < x_dim: 19 | if pad_before: 20 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 21 | else: 22 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 23 | 24 | return int_list 25 | 26 | 27 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 28 | 29 | try: 30 | dataframe = pandas.read_csv(filename, 31 | header=0, 32 | usecols=["src_content", "label"], 33 | skiprows=list(range(1, start_index)), 34 | nrows=batch_size, 35 | engine='python') 36 | labels = dataframe["label"].values.astype(np.uint8) 37 | except ValueError: 38 | dataframe = pandas.read_csv(filename, 39 | header=0, 40 | usecols=["src_content"], 41 | skiprows=list(range(1, start_index)), 42 | nrows=batch_size, 43 | engine='python') 44 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 45 | 46 | labels = labels.reshape((labels.shape[0], 1)) 47 | src_content = list(convert_content(x, x_dim=x_dim, pad_before=pad_before) 48 | for x in dataframe["src_content"].values) 49 | 50 | src_content_aug = src_content 51 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 52 | 53 | for i in range(1, augmentation): 54 | if pad_before: 55 | src_content_aug = src_content_aug + list( 56 | [np.uint8(0)]*i + content[:-i] for content in src_content 57 | ) 58 | else: 59 | src_content_aug = src_content_aug + list( 60 | content[:-i] + [np.uint8(0)] * i for content in src_content 61 | ) 62 | 63 | src_content_aug = np.array(src_content_aug) 64 | file_no = int(start_index / batch_size) 65 | if pad_before: 66 | pad_string = '_prepad' 67 | else: 68 | pad_string = '_postpad' 69 | 70 | basename = os.path.basename(filename) 71 | file_extension_index = basename.rfind('.') 72 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 73 | save_filename = os.path.join(npy_dir, save_basename) 74 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 75 | gc.collect() 76 | 77 | return 78 | 79 | 80 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 81 | 82 | processors = int(multiprocessing.cpu_count() / 1.5) 83 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 84 | 85 | for filepath in datafile_list: 86 | if pad_before: 87 | pad_string = '_prepad' 88 | else: 89 | pad_string = '_postpad' 90 | 91 | filename = os.path.basename(filepath) 92 | file_extension_index = filename.rfind('.') 93 | npy_filename = filename[:file_extension_index] + pad_string + "_0.npy" 94 | 95 | if npy_filename in os.listdir(npy_dir): # Check already parsed npy existence 96 | continue 97 | 98 | try: 99 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 100 | except EmptyDataError: 101 | continue 102 | 103 | row_count = df_temp.shape[0] 104 | del(df_temp) 105 | gc.collect() 106 | 107 | pool = multiprocessing.Pool(processes=processors) 108 | 109 | 
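        # Each CSV is split into chunks of line_per_processor rows; the worker pool
        # converts the chunks in parallel and writes each chunk to its own numbered
        # .npy file (suffix _<chunk_no>.npy).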
split_size = int(np.ceil(row_count / line_per_processor)) 110 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 111 | 112 | pool.map(partial(convert_data, 113 | filename=filepath, 114 | npy_dir=npy_dir, 115 | batch_size=line_per_processor, 116 | x_dim=x_dim, 117 | pad_before=pad_before, 118 | augmentation=augmentation 119 | ), 120 | index_list) 121 | 122 | pool.close() 123 | pool.join() 124 | gc.collect() 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 130 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 131 | yesterday_string = yesterday.strftime("%Y%m%d") 132 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 133 | data_dir = "./data/" 134 | npy_dir = "./npy/" 135 | 136 | payload_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "payload" in f and day_before_yesterday_string in f) 139 | 140 | aut_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "aut" in f and yesterday_string not in f) 143 | 144 | label_file_list = list(os.path.join(data_dir, f) 145 | for f in os.listdir(data_dir) 146 | if "INV-AUT" in f and yesterday_string not in f) 147 | 148 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 149 | convert_file_list(aut_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 150 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 151 | 152 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import numpy as np 4 | import gc 5 | import multiprocessing 6 | import pandas 7 | import datetime 8 | import sys 9 | from pandas.errors import EmptyDataError 10 | 11 | from functools import partial 12 | 13 | 14 | 15 | def convert_content(content_string, x_dim, pad_before=True): 16 | 17 | int_list = list(map(np.uint8, str(content_string).encode('utf-8')))[:x_dim] 18 | if len(int_list) < x_dim: 19 | if pad_before: 20 | int_list = [np.uint8(0)] * (x_dim - len(int_list)) + int_list # Pad Before 21 | else: 22 | int_list = int_list + [np.uint8(0)] * (x_dim - len(int_list)) # Pad After 23 | 24 | return int_list 25 | 26 | 27 | def convert_data(start_index, filename, npy_dir, batch_size, x_dim, pad_before=True, augmentation=1): 28 | 29 | try: 30 | dataframe = pandas.read_csv(filename, 31 | header=0, 32 | usecols=["src_content", "label"], 33 | skiprows=list(range(1, start_index)), 34 | nrows=batch_size, 35 | engine='python') 36 | labels = dataframe["label"].values.astype(np.uint8) 37 | except ValueError: 38 | dataframe = pandas.read_csv(filename, 39 | header=0, 40 | usecols=["src_content"], 41 | skiprows=list(range(1, start_index)), 42 | nrows=batch_size, 43 | engine='python') 44 | labels = np.array([np.uint8(0)] * dataframe.shape[0]) 45 | 46 | labels = labels.reshape((labels.shape[0], 1)) 47 | src_content = list(convert_content(x, x_dim=x_dim, pad_before=pad_before) 48 | for x in dataframe["src_content"].values) 49 | 50 | src_content_aug = src_content 51 | labels_aug = np.concatenate(tuple([labels] * augmentation)) 52 | 53 | for i in range(1, augmentation): 54 | if pad_before: 55 | src_content_aug = src_content_aug + list( 56 | [np.uint8(0)]*i + content[:-i] for content in src_content 57 | ) 58 | 
else: 59 | src_content_aug = src_content_aug + list( 60 | content[:-i] + [np.uint8(0)] * i for content in src_content 61 | ) 62 | 63 | src_content_aug = np.array(src_content_aug) 64 | file_no = int(start_index / batch_size) 65 | if pad_before: 66 | pad_string = '_prepad' 67 | else: 68 | pad_string = '_postpad' 69 | 70 | basename = os.path.basename(filename) 71 | file_extension_index = basename.rfind('.') 72 | save_basename = basename[:file_extension_index] + pad_string + '_' + str(file_no) + '.npy' 73 | save_filename = os.path.join(npy_dir, save_basename) 74 | np.save(save_filename, np.concatenate((src_content_aug, labels_aug), axis=1)) 75 | gc.collect() 76 | 77 | return 78 | 79 | 80 | def convert_file_list(datafile_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1): 81 | 82 | processors = int(multiprocessing.cpu_count() / 1.5) 83 | line_per_processor = int(1048576 / augmentation) # pow(2, 20) 84 | 85 | for filepath in datafile_list: 86 | if pad_before: 87 | pad_string = '_prepad' 88 | else: 89 | pad_string = '_postpad' 90 | 91 | filename = os.path.basename(filepath) 92 | file_extension_index = filename.rfind('.') 93 | npy_filename = filename[:file_extension_index] + pad_string + "_0.npy" 94 | 95 | if npy_filename in os.listdir(npy_dir): # Check already parsed npy existence 96 | continue 97 | 98 | try: 99 | df_temp = pandas.read_csv(filepath, header=0, engine='python') 100 | except EmptyDataError: 101 | continue 102 | 103 | row_count = df_temp.shape[0] 104 | del(df_temp) 105 | gc.collect() 106 | 107 | pool = multiprocessing.Pool(processes=processors) 108 | 109 | split_size = int(np.ceil(row_count / line_per_processor)) 110 | index_list = list(range(0, split_size*line_per_processor, line_per_processor)) 111 | 112 | pool.map(partial(convert_data, 113 | filename=filepath, 114 | npy_dir=npy_dir, 115 | batch_size=line_per_processor, 116 | x_dim=x_dim, 117 | pad_before=pad_before, 118 | augmentation=augmentation 119 | ), 120 | index_list) 121 | 122 | pool.close() 123 | pool.join() 124 | gc.collect() 125 | 126 | 127 | if __name__ == "__main__": 128 | 129 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 130 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 131 | yesterday_string = yesterday.strftime("%Y%m%d") 132 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 133 | data_dir = "./data/" 134 | npy_dir = "./npy/" 135 | 136 | payload_file_list = list(os.path.join(data_dir, f) 137 | for f in os.listdir(data_dir) 138 | if "payload" in f and day_before_yesterday_string in f) 139 | 140 | app_file_list = list(os.path.join(data_dir, f) 141 | for f in os.listdir(data_dir) 142 | if "app" in f and yesterday_string not in f) 143 | 144 | label_file_list = list(os.path.join(data_dir, f) 145 | for f in os.listdir(data_dir) 146 | if "INV-APP" in f and yesterday_string not in f) 147 | 148 | convert_file_list(payload_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=1) 149 | convert_file_list(app_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 150 | convert_file_list(label_file_list, npy_dir, x_dim=1000, pad_before=True, augmentation=20) 151 | 152 | -------------------------------------------------------------------------------- /inv_sql/inv_sql_train.py: -------------------------------------------------------------------------------- 1 | import keras.utils 2 | import numpy as np 3 | import datetime 4 | import os 5 | import gc 6 | import keras.backend.tensorflow_backend as keras_tf_backend 7 | import 
tensorflow as tf 8 | import psutil 9 | import threading 10 | import shutil 11 | 12 | from copy import deepcopy 13 | from keras.models import Sequential 14 | from keras.layers import Conv1D, MaxPooling1D 15 | from keras.layers import LeakyReLU, BatchNormalization 16 | from keras.layers import Dropout 17 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 18 | 19 | class DataSequence(keras.utils.Sequence): 20 | 21 | def _find_npy_size(self, filename): 22 | with open(filename, 'rb') as f: 23 | data = str(f.read(100)[50:]) 24 | shape_index = data.find('shape') 25 | comma_index = data[shape_index:].find(',') 26 | 27 | return int(data[shape_index+9 : shape_index+comma_index]) 28 | 29 | def _load_cache(self, label, idx): 30 | 31 | cache_end_index = self._cache_end_index_dict[label] 32 | cache_start_index = self._cache_start_index_dict[label] 33 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 34 | return 35 | 36 | threshold = self._cache_threshold_dict[label] 37 | batch_size = self._batch_size_dict[label] 38 | 39 | delete_count = int((idx - 64) * batch_size) - cache_start_index 40 | if delete_count > 0: 41 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 42 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 43 | 44 | new_row_count = self._cache_dict[label].shape[0] 45 | dict_size = self._cache_dict[label].nbytes 46 | if new_row_count > 0: 47 | self._avg_size = float(dict_size / new_row_count) 48 | 49 | remaining_size = threshold - dict_size 50 | temp_array_filename_list = [] 51 | while self._file_size_list_dict[label]: 52 | file_name = self._file_list_dict[label].pop(0) 53 | file_size = self._file_size_list_dict[label].pop(0) 54 | temp_array_filename_list.append(file_name) 55 | remaining_size = remaining_size - file_size * self._avg_size 56 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 57 | 58 | if not self._file_size_list_dict[label]: 59 | break 60 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 61 | break 62 | 63 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + list(np.load(x) for x in temp_array_filename_list))) 64 | gc.collect() 65 | 66 | return 67 | 68 | def _initialize_objects(self): 69 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 70 | self._cache_dict = dict() 71 | gc.collect() 72 | 73 | for label in self._file_list_dict: 74 | self._file_size_list_dict[label] = [] 75 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 76 | self._cache_start_index_dict[label] = 0 77 | self._cache_end_index_dict[label] = 0 78 | for filename in self._file_list_dict[label]: 79 | npy_size = self._find_npy_size(filename) 80 | self._file_size_list_dict[label].append(npy_size) 81 | 82 | for label in self._cache_dict: 83 | self._load_cache(label, 0) 84 | 85 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 86 | 87 | self._backup_file_list_dict = deepcopy(file_list_dict) 88 | self._batch_size = batch_size 89 | self._x_dim = x_dim 90 | self._avg_size = (x_dim + 1) 91 | self._data_size = 0 92 | self._file_size_list_dict = dict() 93 | self._batch_size_dict = dict() 94 | self._cache_threshold_dict = dict() 95 | self._cache_start_index_dict = dict() 96 | self._cache_end_index_dict = dict() 97 | self._lock_dict = dict() 98 | 99 | mem = psutil.virtual_memory() 100 | buffer = 0.25 101 | 102 | total_threshold = mem.available * mem_share * (1 - buffer) 103 | 104 | for label in 
file_list_dict: 105 | self._batch_size_dict[label] = 0 106 | self._lock_dict[label] = threading.Lock() 107 | 108 | for filename in file_list_dict[label]: 109 | npy_size = self._find_npy_size(filename) 110 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 111 | self._data_size = self._data_size + npy_size 112 | 113 | for label in self._batch_size_dict: 114 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 115 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 116 | 117 | # Dividing with total data size / label size to get the batch size of each label 118 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 119 | 120 | self._initialize_objects() 121 | 122 | def __len__(self): 123 | return int(np.ceil(self._data_size / self._batch_size)) 124 | 125 | def __getitem__(self, idx): 126 | 127 | data = np.empty((0, self._x_dim+1)) 128 | for label in self._cache_end_index_dict: 129 | 130 | start_index = int(idx * self._batch_size_dict[label]) 131 | end_index = int((idx + 1) * self._batch_size_dict[label]) 132 | 133 | with self._lock_dict[label]: 134 | if end_index > self._cache_end_index_dict[label]: 135 | self._load_cache(label, idx) 136 | cache_start_index = start_index - self._cache_start_index_dict[label] 137 | 138 | if cache_start_index < 0: 139 | cache_start_index = 0 140 | 141 | cache_end_index = cache_start_index + (end_index - start_index) 142 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 143 | 144 | data = data[:self._batch_size] 145 | np.random.shuffle(data) 146 | 147 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 148 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 149 | 150 | train_x = (train_x - 128.0) / -128.0 151 | 152 | return (train_x, train_y) 153 | 154 | def on_epoch_end(self): 155 | self._initialize_objects() 156 | 157 | 158 | def get_session(gpu_share=0.2, threads=2): 159 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 160 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads, gpu_options=gpu_options) 161 | config.gpu_options.allow_growth = True 162 | 163 | return tf.Session(config=config) 164 | 165 | 166 | def initialize_hyperparameters(parameter_dict): 167 | 168 | if parameter_dict is None: 169 | # Default Values 170 | parameter_tuple = (3, 12, 4, 6, 14, 4, 8, 0.5, 0.1) 171 | else: 172 | parameter_tuple = ( 173 | parameter_dict["conv_depth"], 174 | parameter_dict["conv_filter"], 175 | parameter_dict["conv_kernel_width"], 176 | parameter_dict["conv_pool"], 177 | parameter_dict["lstm_units"], 178 | parameter_dict["dense_depth"], 179 | parameter_dict["dense_units"], 180 | parameter_dict["dense_dropout"], 181 | parameter_dict["dense_relu_alpha"] 182 | ) 183 | return parameter_tuple 184 | 185 | 186 | def create_model(input_dim, hyperparameter_dict=None): 187 | 188 | (conv_depth, 189 | conv_filter, 190 | conv_kernel_width, 191 | conv_pool, 192 | lstm_units, 193 | dense_depth, 194 | dense_units, 195 | dense_dropout, 196 | dense_relu_alpha 197 | ) = initialize_hyperparameters(hyperparameter_dict) 198 | 199 | model = Sequential() 200 | 201 | # CNN Layer 202 | for i in range(conv_depth): 203 | conv_filter_size = conv_filter * pow(conv_pool, i) 204 | if i == 0: 205 | model.add(Conv1D(conv_filter_size, 206 | conv_kernel_width, 207 | padding='same', 208 | activation='relu', 
209 | input_shape=(input_dim, 1))) 210 | else: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu')) 215 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 216 | model.add(BatchNormalization()) 217 | 218 | # RNN Layer 219 | if conv_depth > 0: 220 | (_, lstm_timesteps, lstm_features) = model.output_shape 221 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 222 | else: 223 | lstm_input_shape = (input_dim, 1) # Starts with RNN 224 | 225 | model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 226 | model.add(Bidirectional(CuDNNLSTM(lstm_units))) 227 | 228 | for _ in range(dense_depth): 229 | model.add(Dense(dense_units)) 230 | model.add(Dropout(dense_dropout)) 231 | model.add(LeakyReLU(dense_relu_alpha)) 232 | model.add(BatchNormalization()) 233 | 234 | # Output Layer 235 | model.add(Dense(1, activation='sigmoid')) 236 | 237 | # print(model.summary()) 238 | model.compile(optimizer='adam' , loss='binary_crossentropy', metrics=['accuracy']) 239 | 240 | return model 241 | 242 | 243 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 244 | 245 | processors = int(psutil.cpu_count() / 1.5) 246 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 247 | model.fit_generator(generator=generator, 248 | epochs=5, 249 | verbose=verbose, 250 | shuffle=False, 251 | workers=processors, 252 | ) 253 | 254 | if save_model: 255 | model.save(model_name) 256 | 257 | 258 | def test_model(model, test_file_list, x_dim): 259 | 260 | data = np.concatenate(tuple( 261 | list(np.load(filename) for filename in test_file_list) 262 | )) 263 | 264 | test_size = data.shape[0] 265 | 266 | x_test = np.array(data[:, :-1]) 267 | x_test = (x_test - 128.0) / -128.0 268 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 269 | y_test = data[:, [-1]].reshape(test_size, 1) 270 | y_prediction = model.predict(x=x_test, 271 | batch_size=4096, 272 | verbose=0) 273 | 274 | y_merged = (y_prediction.round()*2 + y_test).flatten() 275 | value, counts = np.unique(y_merged, return_counts=True) 276 | value_str = list(map(lambda x: str(int(x)), value)) 277 | metrics = dict(zip(value_str, counts)) 278 | 279 | for y_merged in ['0', '1', '2', '3']: 280 | if y_merged not in metrics: 281 | metrics[y_merged] = 0 282 | 283 | metrics['TP'] = metrics['3'] 284 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 285 | metrics['FN'] = metrics['1'] 286 | metrics['TN'] = metrics['0'] 287 | try: 288 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 289 | except ZeroDivisionError: 290 | metrics['Precision'] = 0 291 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 292 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 293 | (metrics['Precision'] + metrics['Recall']) 294 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 295 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 296 | 297 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 298 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 299 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 300 | metrics['Loss'] = np.average(loss) 301 | 302 | return metrics 303 | 304 | 305 | def search_model(datafile_dict, test_file_list, x_dim): 306 | 307 | if not test_file_list: 308 | 
print('Test File List is Empty!') 309 | return 310 | 311 | with open('results.csvx', 'w') as result_file: 312 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 313 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 314 | headers = headers + 'loss,accuracy,F-score' 315 | result_file.write(headers + '\n') 316 | 317 | for conv_depth in [3]: 318 | for conv_filter in [4,6,8,10,12]: 319 | for conv_kernel_width in [4, 3]: 320 | for conv_pool in [2, 3, 4, 5, 6]: 321 | for lstm_units in [16,14,12,8]: 322 | for dense_depth in [1,2,3,4,5,6,7,8]: 323 | for dense_units in [16,12,8]: 324 | for dense_dropout in [0.1, 0.3]: 325 | for dense_relu_alpha in [0.1]: 326 | hyper_p_dict = { 327 | "conv_depth" : conv_depth, 328 | "conv_filter" : conv_filter, 329 | "conv_kernel_width" : conv_kernel_width, 330 | "conv_pool" : conv_pool, 331 | "lstm_units" : lstm_units, 332 | "dense_depth" : dense_depth, 333 | "dense_units" : dense_units, 334 | "dense_dropout" : dense_dropout, 335 | "dense_relu_alpha" : dense_relu_alpha 336 | } 337 | if np.random.random() < 0.3: 338 | continue 339 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 340 | result_file.flush() 341 | model = create_model(x_dim, hyper_p_dict) 342 | train_model(model, 343 | datafile_dict, 344 | x_dim=x_dim, 345 | save_model=False, 346 | verbose=0) 347 | metrics = test_model(model, test_file_list, x_dim=x_dim) 348 | result_file.write(',' + str(metrics['Loss']) + 349 | ',' + str(metrics['Accuracy']) + 350 | ',' + str(metrics['F-Score']) + '\n') 351 | result_file.flush() 352 | keras_tf_backend.clear_session() 353 | del(model) 354 | processors = int(psutil.cpu_count() / 1.5) 355 | keras_tf_backend.set_session(get_session(0.5, processors)) 356 | gc.collect() 357 | 358 | 359 | if __name__ == '__main__': 360 | 361 | processors = int(psutil.cpu_count() / 1.5) 362 | keras_tf_backend.set_session(get_session(0.5, processors)) 363 | 364 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 365 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 366 | yesterday_string = yesterday.strftime("%Y%m%d") 367 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 368 | npy_dir = "./npy/" 369 | model_dir = "./models/" 370 | pad_string = "prepad" 371 | 372 | model_category = "INV-SQL" 373 | model_type = "CDRNN-" + pad_string 374 | model_name = model_category + "-" + model_type 375 | model_filename = model_name + ".h5" 376 | 377 | model_path = os.path.join(model_dir, model_filename) 378 | model_backup_filename = model_filename + "." 
+ yesterday_string 379 | model_backup_path = os.path.join(model_dir, model_backup_filename) 380 | 381 | model = create_model(1000) 382 | 383 | payload_files = list(os.path.join(npy_dir, f) 384 | for f in os.listdir(npy_dir) 385 | if "_payload_" + pad_string in f 386 | and day_before_yesterday_string in f) 387 | 388 | sql_files = list(os.path.join(npy_dir, f) 389 | for f in os.listdir(npy_dir) 390 | if "_sql_" + pad_string in f 391 | and yesterday_string not in f) 392 | 393 | label_files = list(os.path.join(npy_dir, f) 394 | for f in os.listdir(npy_dir) 395 | if model_name in f) 396 | 397 | if os.path.exists(model_path): 398 | shutil.copy(model_path, model_backup_path) 399 | 400 | train_model(model, {'payload': payload_files, 'sql': sql_files, 'label': label_files}, 401 | x_dim=1000, save_model=True, model_name=model_path, verbose=0) 402 | 403 | 404 | 405 | 406 | -------------------------------------------------------------------------------- /inv_aut/inv_aut_train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import threading 4 | import os 5 | import shutil 6 | from copy import deepcopy 7 | 8 | import keras.backend.tensorflow_backend as keras_tf_backend 9 | import keras.utils 10 | import numpy as np 11 | import psutil 12 | import tensorflow as tf 13 | from keras.layers import Conv1D, MaxPooling1D 14 | from keras.layers import Dropout 15 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 16 | from keras.layers import LeakyReLU, BatchNormalization 17 | from keras.models import Sequential 18 | 19 | 20 | class DataSequence(keras.utils.Sequence): 21 | 22 | def _find_npy_size(self, filename): 23 | with open(filename, 'rb') as f: 24 | data = str(f.read(100)[50:]) 25 | shape_index = data.find('shape') 26 | comma_index = data[shape_index:].find(',') 27 | 28 | return int(data[shape_index+9 : shape_index+comma_index]) 29 | 30 | def _load_cache(self, label, idx): 31 | 32 | cache_end_index = self._cache_end_index_dict[label] 33 | cache_start_index = self._cache_start_index_dict[label] 34 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 35 | return 36 | 37 | threshold = self._cache_threshold_dict[label] 38 | batch_size = self._batch_size_dict[label] 39 | 40 | delete_count = int((idx - 64) * batch_size) - cache_start_index 41 | if delete_count > 0: 42 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 43 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 44 | 45 | new_row_count = self._cache_dict[label].shape[0] 46 | dict_size = self._cache_dict[label].nbytes 47 | if new_row_count > 0: 48 | self._avg_size = float(dict_size / new_row_count) 49 | 50 | remaining_size = threshold - dict_size 51 | temp_array_filename_list = [] 52 | while self._file_size_list_dict[label]: 53 | file_name = self._file_list_dict[label].pop(0) 54 | file_size = self._file_size_list_dict[label].pop(0) 55 | temp_array_filename_list.append(file_name) 56 | remaining_size = remaining_size - file_size * self._avg_size 57 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 58 | 59 | if not self._file_size_list_dict[label]: 60 | break 61 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 62 | break 63 | 64 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + 65 | list(np.load(x) for x in temp_array_filename_list) 66 | ) 67 | ) 68 | gc.collect() 69 | 70 | return 71 | 72 | def 
_initialize_objects(self): 73 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 74 | self._cache_dict = dict() 75 | gc.collect() 76 | 77 | for label in self._file_list_dict: 78 | self._file_size_list_dict[label] = [] 79 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 80 | self._cache_start_index_dict[label] = 0 81 | self._cache_end_index_dict[label] = 0 82 | for filename in self._file_list_dict[label]: 83 | npy_size = self._find_npy_size(filename) 84 | self._file_size_list_dict[label].append(npy_size) 85 | 86 | for label in self._cache_dict: 87 | self._load_cache(label, 0) 88 | 89 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 90 | 91 | self._backup_file_list_dict = deepcopy(file_list_dict) 92 | self._batch_size = batch_size 93 | self._x_dim = x_dim 94 | self._avg_size = (x_dim + 1) 95 | self._data_size = 0 96 | self._file_size_list_dict = dict() 97 | self._batch_size_dict = dict() 98 | self._cache_threshold_dict = dict() 99 | self._cache_start_index_dict = dict() 100 | self._cache_end_index_dict = dict() 101 | self._lock_dict = dict() 102 | 103 | mem = psutil.virtual_memory() 104 | buffer = 0.25 105 | 106 | total_threshold = mem.available * mem_share * (1 - buffer) 107 | 108 | for label in file_list_dict: 109 | self._batch_size_dict[label] = 0 110 | self._lock_dict[label] = threading.Lock() 111 | 112 | for filename in file_list_dict[label]: 113 | npy_size = self._find_npy_size(filename) 114 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 115 | self._data_size = self._data_size + npy_size 116 | 117 | for label in self._batch_size_dict: 118 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 119 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 120 | 121 | # Dividing with total data size / label size to get the batch size of each label 122 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 123 | 124 | self._initialize_objects() 125 | 126 | def __len__(self): 127 | return int(np.ceil(self._data_size / self._batch_size)) 128 | 129 | def __getitem__(self, idx): 130 | 131 | data = np.empty((0, self._x_dim+1)) 132 | for label in self._cache_end_index_dict: 133 | 134 | start_index = int(idx * self._batch_size_dict[label]) 135 | end_index = int((idx + 1) * self._batch_size_dict[label]) 136 | 137 | with self._lock_dict[label]: 138 | if end_index > self._cache_end_index_dict[label]: 139 | self._load_cache(label, idx) 140 | cache_start_index = start_index - self._cache_start_index_dict[label] 141 | 142 | if cache_start_index < 0: 143 | cache_start_index = 0 144 | 145 | cache_end_index = cache_start_index + (end_index - start_index) 146 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 147 | 148 | data = data[:self._batch_size] 149 | np.random.shuffle(data) 150 | 151 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 152 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 153 | 154 | train_x = (train_x - 128.0) / -128.0 155 | 156 | return (train_x, train_y) 157 | 158 | def on_epoch_end(self): 159 | self._initialize_objects() 160 | 161 | 162 | def get_session(gpu_share=0.2, threads=2): 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 164 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, 165 | intra_op_parallelism_threads=threads, 166 | 
gpu_options=gpu_options) 167 | config.gpu_options.allow_growth = True 168 | 169 | return tf.Session(config=config) 170 | 171 | 172 | def initialize_hyperparameters(parameter_dict): 173 | 174 | if parameter_dict is None: 175 | # Default Values 176 | parameter_tuple = (2, 12, 4, 5, 16, 8, 12, 0.1, 0.1) 177 | else: 178 | parameter_tuple = ( 179 | parameter_dict["conv_depth"], 180 | parameter_dict["conv_filter"], 181 | parameter_dict["conv_kernel_width"], 182 | parameter_dict["conv_pool"], 183 | parameter_dict["lstm_units"], 184 | parameter_dict["dense_depth"], 185 | parameter_dict["dense_units"], 186 | parameter_dict["dense_dropout"], 187 | parameter_dict["dense_relu_alpha"] 188 | ) 189 | return parameter_tuple 190 | 191 | 192 | def create_model(input_dim, hyperparameter_dict=None): 193 | 194 | (conv_depth, 195 | conv_filter, 196 | conv_kernel_width, 197 | conv_pool, 198 | lstm_units, 199 | dense_depth, 200 | dense_units, 201 | dense_dropout, 202 | dense_relu_alpha 203 | ) = initialize_hyperparameters(hyperparameter_dict) 204 | 205 | model = Sequential() 206 | 207 | # CNN Layer 208 | for i in range(conv_depth): 209 | conv_filter_size = conv_filter * pow(conv_pool, i) 210 | if i == 0: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu', 215 | input_shape=(input_dim, 1))) 216 | else: 217 | model.add(Conv1D(conv_filter_size, 218 | conv_kernel_width, 219 | padding='same', 220 | activation='relu')) 221 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 222 | model.add(BatchNormalization()) 223 | 224 | # RNN Layer 225 | if conv_depth > 0: 226 | (_, lstm_timesteps, lstm_features) = model.output_shape 227 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 228 | else: 229 | lstm_input_shape = (input_dim, 1) # Starts with RNN 230 | 231 | #model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=lstm_input_shape)) 232 | #model.add(Bidirectional(LSTM(lstm_units))) 233 | model.add(LSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 234 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 235 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True)) 236 | #model.add(CuDNNLSTM(lstm_units)) 237 | #model.add(CuDNNLSTM(lstm_units, input_shape=lstm_input_shape)) 238 | #model.add(Bidirectional(CuDNNLSTM(lstm_units), input_shape=lstm_input_shape)) 239 | model.add(Bidirectional(LSTM(lstm_units))) 240 | #model.add(Bidirectional(CuDNNLSTM(lstm_units))) 241 | 242 | # DNN Layer 243 | for _ in range(dense_depth): 244 | model.add(Dense(dense_units)) 245 | model.add(Dropout(dense_dropout)) 246 | model.add(LeakyReLU(dense_relu_alpha)) 247 | model.add(BatchNormalization()) 248 | 249 | # Output Layer 250 | model.add(Dense(1, activation='sigmoid')) 251 | 252 | # print(model.summary()) 253 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 254 | 255 | return model 256 | 257 | 258 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 259 | 260 | processors = int(psutil.cpu_count() / 1.5) 261 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 262 | model.fit_generator(generator=generator, 263 | epochs=5, 264 | verbose=verbose, 265 | shuffle=False, 266 | workers=processors 267 | ) 268 | 269 | if save_model: 270 | model.save(model_name) 271 | 272 | 273 | def test_model(model, test_file_list, x_dim): 274 | 275 | data = np.concatenate(tuple( 276 | 
list(np.load(filename) for filename in test_file_list) 277 | )) 278 | 279 | test_size = data.shape[0] 280 | 281 | x_test = np.array(data[:, :-1]) 282 | x_test = (x_test - 128.0) / -128.0 283 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 284 | y_test = data[:, [-1]].reshape(test_size, 1) 285 | y_prediction = model.predict(x=x_test, 286 | batch_size=4096, 287 | verbose=0) 288 | 289 | y_merged = (y_prediction.round()*2 + y_test).flatten() 290 | value, counts = np.unique(y_merged, return_counts=True) 291 | value_str = list(map(lambda x: str(int(x)), value)) 292 | metrics = dict(zip(value_str, counts)) 293 | 294 | for y_merged in ['0', '1', '2', '3']: 295 | if y_merged not in metrics: 296 | metrics[y_merged] = 0 297 | 298 | metrics['TP'] = metrics['3'] 299 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 300 | metrics['FN'] = metrics['1'] 301 | metrics['TN'] = metrics['0'] 302 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 303 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 304 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 305 | (metrics['Precision'] + metrics['Recall']) 306 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 307 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 308 | 309 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 310 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 311 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 312 | metrics['Loss'] = np.average(loss) 313 | 314 | return metrics 315 | 316 | 317 | def search_model(datafile_dict, test_file_list, x_dim): 318 | 319 | if not test_file_list: 320 | print('Test File List is Empty!') 321 | return 322 | 323 | with open('results.csvx', 'w') as result_file: 324 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 325 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 326 | headers = headers + 'loss,accuracy,F-score' 327 | result_file.write(headers + '\n') 328 | 329 | for conv_depth in [1, 2, 3]: 330 | for conv_filter in [4, 6]: 331 | for conv_kernel_width in [4, 3]: 332 | for conv_pool in [2, 3]: 333 | for lstm_units in [16, 8]: 334 | for dense_depth in [1]: 335 | for dense_units in [16, 8]: 336 | for dense_dropout in [0.1, 0.3]: 337 | for dense_relu_alpha in [0.1]: 338 | hyper_p_dict = { 339 | "conv_depth" : conv_depth, 340 | "conv_filter" : conv_filter, 341 | "conv_kernel_width" : conv_kernel_width, 342 | "conv_pool" : conv_pool, 343 | "lstm_units" : lstm_units, 344 | "dense_depth" : dense_depth, 345 | "dense_units" : dense_units, 346 | "dense_dropout" : dense_dropout, 347 | "dense_relu_alpha" : dense_relu_alpha 348 | } 349 | if np.random.random() < 0.3: 350 | continue 351 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 352 | result_file.flush() 353 | model = create_model(x_dim, hyper_p_dict) 354 | train_model(model, 355 | datafile_dict, 356 | x_dim=x_dim, 357 | save_model=False, 358 | verbose=0) 359 | metrics = test_model(model, test_file_list, x_dim=x_dim) 360 | result_file.write(',' + str(metrics['Loss']) + 361 | ',' + str(metrics['Accuracy']) + 362 | ',' + str(metrics['F-Score']) + '\n') 363 | result_file.flush() 364 | keras_tf_backend.clear_session() 365 | del(model) 366 | processors = int(psutil.cpu_count() / 1.5) 367 | keras_tf_backend.set_session(get_session(0.5, processors)) 368 | gc.collect() 369 | 
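# Daily retraining entry point below: the existing model is copied to a date-stamped backup in ./models/, the matching "_payload_prepad", "_aut_prepad" and label .npy shards are gathered from ./npy/, and a fresh 1000-byte CRDNN is trained and saved over model_path.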
370 | 371 | if __name__ == '__main__': 372 | 373 | processors = int(psutil.cpu_count() / 1.5) 374 | keras_tf_backend.set_session(get_session(0.5, processors)) 375 | 376 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 377 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 378 | yesterday_string = yesterday.strftime("%Y%m%d") 379 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 380 | npy_dir = "./npy/" 381 | model_dir = "./models/" 382 | pad_string = "prepad" 383 | 384 | model_category = "INV-AUT" 385 | model_type = "CRDNN-" + pad_string 386 | model_name = model_category + "-" + model_type 387 | model_filename = model_name + ".h5" 388 | 389 | model_path = os.path.join(model_dir, model_filename) 390 | model_backup_filename = model_filename + "." + yesterday_string 391 | model_backup_path = os.path.join(model_dir, model_backup_filename) 392 | 393 | model = create_model(1000) 394 | 395 | payload_files = list(os.path.join(npy_dir, f) 396 | for f in os.listdir(npy_dir) 397 | if "_payload_" + pad_string in f 398 | and day_before_yesterday_string in f) 399 | 400 | aut_files = list(os.path.join(npy_dir, f) 401 | for f in os.listdir(npy_dir) 402 | if "_aut_" + pad_string in f 403 | and yesterday_string not in f) 404 | 405 | label_files = list(os.path.join(npy_dir, f) 406 | for f in os.listdir(npy_dir) 407 | if model_name + "_" + pad_string in f) 408 | 409 | if os.path.exists(model_path): 410 | shutil.copy(model_path, model_backup_path) 411 | 412 | train_model(model, {'payload': payload_files, 'aut': aut_files, 'label': label_files}, 413 | x_dim=1000, save_model=True, model_name=model_path, verbose=0) 414 | 415 | -------------------------------------------------------------------------------- /inv_app/inv_app_crd_train.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import threading 4 | import os 5 | import shutil 6 | from copy import deepcopy 7 | 8 | import keras.backend.tensorflow_backend as keras_tf_backend 9 | import keras.utils 10 | import numpy as np 11 | import psutil 12 | import tensorflow as tf 13 | from keras.layers import Conv1D, MaxPooling1D 14 | from keras.layers import Dropout 15 | from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM 16 | from keras.layers import LeakyReLU, BatchNormalization 17 | from keras.models import Sequential 18 | #from keras.utils.vis_utils import plot_model 19 | 20 | class DataSequence(keras.utils.Sequence): 21 | 22 | def _find_npy_size(self, filename): 23 | with open(filename, 'rb') as f: 24 | data = str(f.read(100)[50:]) 25 | shape_index = data.find('shape') 26 | comma_index = data[shape_index:].find(',') 27 | 28 | return int(data[shape_index+9 : shape_index+comma_index]) 29 | 30 | def _load_cache(self, label, idx): 31 | 32 | cache_end_index = self._cache_end_index_dict[label] 33 | cache_start_index = self._cache_start_index_dict[label] 34 | if (idx + 1) * self._batch_size_dict[label] <= cache_end_index: 35 | return 36 | 37 | threshold = self._cache_threshold_dict[label] 38 | batch_size = self._batch_size_dict[label] 39 | 40 | delete_count = int((idx - 64) * batch_size) - cache_start_index 41 | if delete_count > 0: 42 | self._cache_dict[label] = self._cache_dict[label][delete_count:] 43 | self._cache_start_index_dict[label] = self._cache_start_index_dict[label] + delete_count 44 | 45 | new_row_count = self._cache_dict[label].shape[0] 46 | dict_size = self._cache_dict[label].nbytes 47 | if new_row_count 
> 0: 48 | self._avg_size = float(dict_size / new_row_count) 49 | 50 | remaining_size = threshold - dict_size 51 | temp_array_filename_list = [] 52 | while self._file_size_list_dict[label]: 53 | file_name = self._file_list_dict[label].pop(0) 54 | file_size = self._file_size_list_dict[label].pop(0) 55 | temp_array_filename_list.append(file_name) 56 | remaining_size = remaining_size - file_size * self._avg_size 57 | self._cache_end_index_dict[label] = self._cache_end_index_dict[label] + file_size 58 | 59 | if not self._file_size_list_dict[label]: 60 | break 61 | if remaining_size <= self._file_size_list_dict[label][0] * self._avg_size: 62 | break 63 | 64 | self._cache_dict[label] = np.concatenate(tuple([self._cache_dict[label]] + 65 | list(np.load(x) for x in temp_array_filename_list) 66 | ) 67 | ) 68 | gc.collect() 69 | 70 | return 71 | 72 | def _initialize_objects(self): 73 | self._file_list_dict = deepcopy(self._backup_file_list_dict) 74 | self._cache_dict = dict() 75 | gc.collect() 76 | 77 | for label in self._file_list_dict: 78 | self._file_size_list_dict[label] = [] 79 | self._cache_dict[label] = np.empty((0, self._x_dim+1), dtype=np.uint8) 80 | self._cache_start_index_dict[label] = 0 81 | self._cache_end_index_dict[label] = 0 82 | for filename in self._file_list_dict[label]: 83 | npy_size = self._find_npy_size(filename) 84 | self._file_size_list_dict[label].append(npy_size) 85 | 86 | for label in self._cache_dict: 87 | self._load_cache(label, 0) 88 | 89 | def __init__(self, file_list_dict, x_dim, batch_size=1024, mem_share=0.2): 90 | 91 | self._backup_file_list_dict = deepcopy(file_list_dict) 92 | self._batch_size = batch_size 93 | self._x_dim = x_dim 94 | self._avg_size = (x_dim + 1) 95 | self._data_size = 0 96 | self._file_size_list_dict = dict() 97 | self._batch_size_dict = dict() 98 | self._cache_threshold_dict = dict() 99 | self._cache_start_index_dict = dict() 100 | self._cache_end_index_dict = dict() 101 | self._lock_dict = dict() 102 | 103 | mem = psutil.virtual_memory() 104 | buffer = 0.25 105 | 106 | total_threshold = mem.available * mem_share * (1 - buffer) 107 | 108 | for label in file_list_dict: 109 | self._batch_size_dict[label] = 0 110 | self._lock_dict[label] = threading.Lock() 111 | 112 | for filename in file_list_dict[label]: 113 | npy_size = self._find_npy_size(filename) 114 | self._batch_size_dict[label] = self._batch_size_dict[label] + npy_size 115 | self._data_size = self._data_size + npy_size 116 | 117 | for label in self._batch_size_dict: 118 | # _batch_size_dict is not the batch size at this point. It is the total row count of each label. 
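# Each label's in-memory cache budget (next line) and its per-batch row quota are set proportionally to that label's share of the total row count.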
119 | self._cache_threshold_dict[label] = int(total_threshold * self._batch_size_dict[label] / self._data_size) 120 | 121 | # Dividing with total data size / label size to get the batch size of each label 122 | self._batch_size_dict[label] = float(self._batch_size_dict[label] * self._batch_size / self._data_size) 123 | 124 | self._initialize_objects() 125 | 126 | def __len__(self): 127 | return int(np.ceil(self._data_size / self._batch_size)) 128 | 129 | def __getitem__(self, idx): 130 | 131 | data = np.empty((0, self._x_dim+1)) 132 | for label in self._cache_end_index_dict: 133 | 134 | start_index = int(idx * self._batch_size_dict[label]) 135 | end_index = int((idx + 1) * self._batch_size_dict[label]) 136 | 137 | with self._lock_dict[label]: 138 | if end_index > self._cache_end_index_dict[label]: 139 | self._load_cache(label, idx) 140 | cache_start_index = start_index - self._cache_start_index_dict[label] 141 | 142 | if cache_start_index < 0: 143 | cache_start_index = 0 144 | 145 | cache_end_index = cache_start_index + (end_index - start_index) 146 | data = np.concatenate((data, self._cache_dict[label][cache_start_index:cache_end_index])) 147 | 148 | data = data[:self._batch_size] 149 | np.random.shuffle(data) 150 | 151 | train_x = data[:, :-1].reshape(data.shape[0], self._x_dim, 1) 152 | train_y = data[:, [-1]].reshape(data.shape[0], 1) 153 | 154 | train_x = (train_x - 128.0) / -128.0 155 | 156 | return (train_x, train_y) 157 | 158 | def on_epoch_end(self): 159 | self._initialize_objects() 160 | 161 | 162 | def get_session(gpu_share=0.2, threads=2): 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_share) 164 | config = tf.ConfigProto(inter_op_parallelism_threads=threads, 165 | intra_op_parallelism_threads=threads, 166 | gpu_options=gpu_options) 167 | config.gpu_options.allow_growth = True 168 | 169 | return tf.Session(config=config) 170 | 171 | 172 | def initialize_hyperparameters(parameter_dict): 173 | 174 | if parameter_dict is None: 175 | # Default Values 176 | parameter_tuple = (2, 12, 4, 5, 16, 8, 12, 0.1, 0.1) 177 | else: 178 | parameter_tuple = ( 179 | parameter_dict["conv_depth"], 180 | parameter_dict["conv_filter"], 181 | parameter_dict["conv_kernel_width"], 182 | parameter_dict["conv_pool"], 183 | parameter_dict["lstm_units"], 184 | parameter_dict["dense_depth"], 185 | parameter_dict["dense_units"], 186 | parameter_dict["dense_dropout"], 187 | parameter_dict["dense_relu_alpha"] 188 | ) 189 | return parameter_tuple 190 | 191 | 192 | def create_model(input_dim, hyperparameter_dict=None): 193 | 194 | (conv_depth, 195 | conv_filter, 196 | conv_kernel_width, 197 | conv_pool, 198 | lstm_units, 199 | dense_depth, 200 | dense_units, 201 | dense_dropout, 202 | dense_relu_alpha 203 | ) = initialize_hyperparameters(hyperparameter_dict) 204 | 205 | model = Sequential() 206 | 207 | # CNN Layer 208 | for i in range(conv_depth): 209 | conv_filter_size = conv_filter * pow(conv_pool, i) 210 | if i == 0: 211 | model.add(Conv1D(conv_filter_size, 212 | conv_kernel_width, 213 | padding='same', 214 | activation='relu', 215 | input_shape=(input_dim, 1))) 216 | else: 217 | model.add(Conv1D(conv_filter_size, 218 | conv_kernel_width, 219 | padding='same', 220 | activation='relu')) 221 | model.add(MaxPooling1D(pool_size=conv_pool, padding='same')) 222 | model.add(BatchNormalization()) 223 | 224 | # RNN Layer 225 | if conv_depth > 0: 226 | (_, lstm_timesteps, lstm_features) = model.output_shape 227 | lstm_input_shape = (lstm_timesteps, lstm_features) # Get input from CNN 228 | else: 
229 | lstm_input_shape = (input_dim, 1) # Starts with RNN 230 | 231 | #model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=lstm_input_shape)) 232 | #model.add(Bidirectional(LSTM(lstm_units))) 233 | model.add(LSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 234 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True, input_shape=lstm_input_shape)) 235 | #model.add(CuDNNLSTM(lstm_units, return_sequences=True)) 236 | #model.add(CuDNNLSTM(lstm_units)) 237 | #model.add(CuDNNLSTM(lstm_units, input_shape=lstm_input_shape)) 238 | #model.add(Bidirectional(CuDNNLSTM(lstm_units), input_shape=lstm_input_shape)) 239 | model.add(Bidirectional(LSTM(lstm_units))) 240 | #model.add(Bidirectional(CuDNNLSTM(lstm_units))) 241 | 242 | # DNN Layer 243 | for _ in range(dense_depth): 244 | model.add(Dense(dense_units)) 245 | model.add(Dropout(dense_dropout)) 246 | model.add(LeakyReLU(dense_relu_alpha)) 247 | model.add(BatchNormalization()) 248 | 249 | # Output Layer 250 | model.add(Dense(1, activation='sigmoid')) 251 | 252 | print(model.summary()) 253 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 254 | #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True) 255 | return model 256 | 257 | 258 | def train_model(model, datafile_dict, x_dim, save_model=False, model_name='default.h5', verbose=0): 259 | 260 | processors = int(psutil.cpu_count() / 1.5) 261 | generator = DataSequence(datafile_dict, x_dim=x_dim, batch_size=8192, mem_share=0.2) 262 | model.fit_generator(generator=generator, 263 | epochs=5, 264 | verbose=verbose, 265 | shuffle=False, 266 | workers=processors 267 | ) 268 | 269 | if save_model: 270 | model.save(model_name) 271 | 272 | 273 | def test_model(model, test_file_list, x_dim): 274 | 275 | data = np.concatenate(tuple( 276 | list(np.load(filename) for filename in test_file_list) 277 | )) 278 | 279 | test_size = data.shape[0] 280 | 281 | x_test = np.array(data[:, :-1]) 282 | x_test = (x_test - 128.0) / -128.0 283 | x_test = x_test.reshape(test_size, x_dim, 1) # 1000 characters at a time, 1 channel 284 | y_test = data[:, [-1]].reshape(test_size, 1) 285 | y_prediction = model.predict(x=x_test, 286 | batch_size=4096, 287 | verbose=0) 288 | 289 | y_merged = (y_prediction.round()*2 + y_test).flatten() 290 | value, counts = np.unique(y_merged, return_counts=True) 291 | value_str = list(map(lambda x: str(int(x)), value)) 292 | metrics = dict(zip(value_str, counts)) 293 | 294 | for y_merged in ['0', '1', '2', '3']: 295 | if y_merged not in metrics: 296 | metrics[y_merged] = 0 297 | 298 | metrics['TP'] = metrics['3'] 299 | metrics['FP'] = metrics['2'] # prediction is 1, label is 0 (1*2 + 0 = 2) 300 | metrics['FN'] = metrics['1'] 301 | metrics['TN'] = metrics['0'] 302 | metrics['Precision'] = metrics['TP'] / (metrics['TP'] + metrics['FP']) 303 | metrics['Recall'] = metrics['TP'] / (metrics['TP'] + metrics['FN']) 304 | metrics['F-Score'] = 2 * metrics['Precision'] * metrics['Recall'] /\ 305 | (metrics['Precision'] + metrics['Recall']) 306 | metrics['Accuracy'] = (metrics['TP'] + metrics['TN']) / \ 307 | (metrics['TP'] + metrics['TN'] + metrics['FP'] + metrics['FN']) 308 | 309 | y_test = tf.convert_to_tensor(y_test, np.float32) # Keras Bug 310 | y_prediction = tf.convert_to_tensor(y_prediction, np.float32) # Keras Bug 311 | loss = keras.backend.eval(keras.losses.binary_crossentropy(y_true=y_test, y_pred=y_prediction)) 312 | metrics['Loss'] = np.average(loss) 313 | 314 | return metrics 315 | 316 | 317 | 
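# search_model below sweeps a fixed grid of CNN/LSTM/DNN hyperparameters, randomly skipping roughly 30% of the combinations; each remaining candidate is trained, scored on the held-out test files, and its loss/accuracy/F-score is appended to results.csvx before the Keras/TF session is reset.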
def search_model(datafile_dict, test_file_list, x_dim): 318 | 319 | if not test_file_list: 320 | print('Test File List is Empty!') 321 | return 322 | 323 | with open('results.csvx', 'w') as result_file: 324 | headers = 'conv_depth,conv_filter,conv_kernel_width,conv_pool,lstm_units,' 325 | headers = headers + 'dense_depth,dense_units,dense_dropout,dense_relu_alpha,' 326 | headers = headers + 'loss,accuracy,F-score' 327 | result_file.write(headers + '\n') 328 | 329 | for conv_depth in [1, 2, 3]: 330 | for conv_filter in [4, 6]: 331 | for conv_kernel_width in [4, 3]: 332 | for conv_pool in [2, 3]: 333 | for lstm_units in [16, 8]: 334 | for dense_depth in [1]: 335 | for dense_units in [16, 8]: 336 | for dense_dropout in [0.1, 0.3]: 337 | for dense_relu_alpha in [0.1]: 338 | hyper_p_dict = { 339 | "conv_depth" : conv_depth, 340 | "conv_filter" : conv_filter, 341 | "conv_kernel_width" : conv_kernel_width, 342 | "conv_pool" : conv_pool, 343 | "lstm_units" : lstm_units, 344 | "dense_depth" : dense_depth, 345 | "dense_units" : dense_units, 346 | "dense_dropout" : dense_dropout, 347 | "dense_relu_alpha" : dense_relu_alpha 348 | } 349 | if np.random.random() < 0.3: 350 | continue 351 | result_file.write(','.join(map(str, hyper_p_dict.values()))) 352 | result_file.flush() 353 | model = create_model(x_dim, hyper_p_dict) 354 | train_model(model, 355 | datafile_dict, 356 | x_dim=x_dim, 357 | save_model=False, 358 | verbose=0) 359 | metrics = test_model(model, test_file_list, x_dim=x_dim) 360 | result_file.write(',' + str(metrics['Loss']) + 361 | ',' + str(metrics['Accuracy']) + 362 | ',' + str(metrics['F-Score']) + '\n') 363 | result_file.flush() 364 | keras_tf_backend.clear_session() 365 | del(model) 366 | processors = int(psutil.cpu_count() / 1.5) 367 | keras_tf_backend.set_session(get_session(0.5, processors)) 368 | gc.collect() 369 | 370 | 371 | if __name__ == '__main__': 372 | 373 | processors = int(psutil.cpu_count() / 1.5) 374 | keras_tf_backend.set_session(get_session(0.5, processors)) 375 | 376 | yesterday = datetime.datetime.today() + datetime.timedelta(days=-1) 377 | day_before_yesterday = datetime.datetime.today() + datetime.timedelta(days=-2) 378 | yesterday_string = yesterday.strftime("%Y%m%d") 379 | day_before_yesterday_string = day_before_yesterday.strftime("%Y%m%d") 380 | npy_dir = "./npy/" 381 | model_dir = "./models/" 382 | pad_string = "prepad" 383 | 384 | model_category = "INV-APP" 385 | model_type = "CRDNN-" + pad_string 386 | model_name = model_category + "-" + model_type 387 | model_filename = model_name + ".h5" 388 | 389 | model_path = os.path.join(model_dir, model_filename) 390 | model_backup_filename = model_filename + "." 
+ yesterday_string 391 | model_backup_path = os.path.join(model_dir, model_backup_filename) 392 | 393 | model = create_model(1000) 394 | 395 | payload_files = list(os.path.join(npy_dir, f) 396 | for f in os.listdir(npy_dir) 397 | if "_payload_" + pad_string in f 398 | and day_before_yesterday_string in f) 399 | 400 | app_files = list(os.path.join(npy_dir, f) 401 | for f in os.listdir(npy_dir) 402 | if "_app_" + pad_string in f 403 | and yesterday_string not in f) 404 | 405 | label_files = list(os.path.join(npy_dir, f) 406 | for f in os.listdir(npy_dir) 407 | if model_name + "_" + pad_string in f) 408 | 409 | if os.path.exists(model_path): 410 | shutil.copy(model_path, model_backup_path) 411 | 412 | # Save to model_path so the model lands in ./models/ alongside its dated backups, as in the other training scripts. 413 | train_model(model, {'payload': payload_files, 'app': app_files, 'label': label_files}, x_dim=1000, save_model=True, model_name=model_path, verbose=0) 414 | 415 | 416 | #test_model(model, test_file_list, x_dim=1000) 417 | 418 | 419 | #search_model({'payload': payload_files, 'app': app_files, 'label': label_files}, test_file_list, x_dim=1000) 420 | 421 | 422 | 423 | 424 | --------------------------------------------------------------------------------
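The TP/FP/FN/TN bookkeeping in test_model above encodes each (prediction, label) pair as a single integer, round(prediction) * 2 + label, so 3 maps to TP, 2 to FP, 1 to FN and 0 to TN. A minimal standalone sketch of that encoding, using illustrative arrays rather than repository data:

import numpy as np

# Illustrative sigmoid outputs and binary labels (not from the repository).
y_prediction = np.array([0.9, 0.8, 0.2, 0.1])
y_test = np.array([1.0, 0.0, 1.0, 0.0])

# Same scheme as test_model: round(prediction) * 2 + label -> 3=TP, 2=FP, 1=FN, 0=TN.
y_merged = (y_prediction.round() * 2 + y_test).flatten()
values, counts = np.unique(y_merged, return_counts=True)
metrics = dict(zip((str(int(v)) for v in values), counts))
for key in ['0', '1', '2', '3']:
    metrics.setdefault(key, 0)

print({'TP': int(metrics['3']), 'FP': int(metrics['2']),
       'FN': int(metrics['1']), 'TN': int(metrics['0'])})
# {'TP': 1, 'FP': 1, 'FN': 1, 'TN': 1}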