├── .gitattributes ├── Data ├── Bro log from Threatglass │ ├── HTML_Bro_log_1 │ │ ├── app_stats.html │ │ ├── conn.html │ │ ├── dhcp.html │ │ ├── dns.html │ │ ├── dpd.html │ │ ├── files.html │ │ ├── ftp.html │ │ ├── http.html │ │ ├── irc.html │ │ ├── loaded_scripts.html │ │ ├── notice.html │ │ ├── packet_filter.html │ │ ├── ssl.html │ │ └── weird.html │ ├── HTML_Bro_log_2 │ │ ├── app_stats.html │ │ ├── conn.html │ │ ├── dhcp.html │ │ ├── dns.html │ │ ├── dpd.html │ │ ├── files.html │ │ ├── ftp.html │ │ ├── http.html │ │ ├── irc.html │ │ ├── loaded_scripts.html │ │ ├── notice.html │ │ ├── packet_filter.html │ │ ├── ssl.html │ │ └── weird.html │ └── HTML_Bro_log_3 │ │ ├── app_stats.html │ │ ├── conn.html │ │ ├── dhcp.html │ │ ├── dns.html │ │ ├── dpd.html │ │ ├── files.html │ │ ├── http.html │ │ ├── irc.html │ │ ├── loaded_scripts.html │ │ ├── notice.html │ │ ├── packet_filter.html │ │ ├── ssl.html │ │ └── weird.html ├── Bro logs from Threatglass datasets │ └── Bro logs from Threatglass datasets.txt ├── PE malware dataset description │ ├── OPCleaver.html │ ├── VirusShare.html │ └── Zeus.html ├── PE malware datasets │ └── PE malware datasets.txt ├── System datasets description │ └── auth.html ├── System datasets │ └── System datasets.txt ├── network datasets description │ ├── dhcp.html │ ├── dns.html │ ├── files.html │ ├── ftp.html │ ├── http.html │ ├── notice.html │ ├── smtp.html │ ├── ssh.html │ ├── ssl.html │ ├── tunnel.html │ └── weird.html └── network datasets │ └── network datasets.txt ├── Data_analysis ├── Bro Logs from Threatglass │ ├── Part 1 │ │ ├── app_stats analysis.ipynb │ │ ├── conn analysis.ipynb │ │ ├── dhcp analysis.ipynb │ │ ├── dns analysis.ipynb │ │ ├── dpd analysis.ipynb │ │ ├── files analysis.ipynb │ │ ├── ftp analysis.ipynb │ │ ├── http analysis.ipynb │ │ ├── irc analysis.ipynb │ │ ├── loaded_scripts analysis.ipynb │ │ ├── notice analysis.ipynb │ │ ├── packet_filter analysis.ipynb │ │ ├── ssl analysis.ipynb │ │ └── weird analysis.ipynb │ ├── Part 
2 │ │ ├── app_stats analysis.ipynb │ │ ├── conn analysis.ipynb │ │ ├── dhcpanalysis.ipynb │ │ ├── dns analysis.ipynb │ │ ├── dpd analysis.ipynb │ │ ├── files analysis.ipynb │ │ ├── ftp analysis.ipynb │ │ ├── http analysis.ipynb │ │ ├── irc analysis.ipynb │ │ ├── loaded_scripts analysis.ipynb │ │ ├── notice analysis.ipynb │ │ ├── packet_filter analysis.ipynb │ │ ├── ssl analysis.ipynb │ │ └── weird analysis.ipynb │ └── Part 3 │ │ ├── app_stats analysis.ipynb │ │ ├── conn analysis.ipynb │ │ ├── dhcp analysis.ipynb │ │ ├── dns analysis.ipynb │ │ ├── dpd analysis.ipynb │ │ ├── files analysis.ipynb │ │ ├── http analysis.ipynb │ │ ├── irc analysis.ipynb │ │ ├── loaded_scripts analysis.ipynb │ │ ├── notice analysis.ipynb │ │ ├── packet_filter analysis.ipynb │ │ ├── ssl analysis.ipynb │ │ └── weird analysis.ipynb ├── Network analysis │ ├── Dhcp_analysis_practice_2.ipynb │ ├── dhcp analysis.ipynb │ ├── dns analysis.ipynb │ ├── ftp analysis.ipynb │ ├── notice analysis.ipynb │ ├── smtp analysis.ipynb │ ├── ssh analysis.ipynb │ ├── ssl analysis.ipynb │ ├── tunnel analysis.ipynb │ └── weird analysis.ipynb ├── PE malware analysis │ ├── Malware_analysis_practice.ipynb │ ├── OP Cleaver Analysis.ipynb │ ├── VirusShare Analysis-checkpoint.ipynb │ └── Zeus Analysis.ipynb └── System analysis │ └── auth analysis.ipynb ├── Machine_learning_practice └── machine_learning.py ├── README.md └── Scripts ├── LogToCsv.py ├── LogtoCsvConverter.py ├── NetworkLogToCSV.py ├── System_Squid_LogToCSV.py ├── featureExtraction.py └── html_Generator.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/app_stats.html: -------------------------------------------------------------------------------- 1 | Home

APP_STATS

Download: app_stats Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%201/app_stats%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/conn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/conn.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/dhcp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/dhcp.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/dns.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/dns.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/dpd.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/dpd.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/files.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/files.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/ftp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/ftp.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/http.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/http.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/irc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/irc.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/loaded_scripts.html: -------------------------------------------------------------------------------- 1 | Home

LOADED_SCRIPTS

Download: loaded_scripts Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Log the loaded scripts.

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%201/loaded_scripts%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/notice.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/notice.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/packet_filter.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/packet_filter.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/ssl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/ssl.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_1/weird.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_1/weird.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/app_stats.html: -------------------------------------------------------------------------------- 1 | Home

APP_STATS

Download: app_stats Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%202/app_stats%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/conn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/conn.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/dhcp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/dhcp.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/dns.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/dns.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/dpd.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/dpd.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/files.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/files.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/ftp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/ftp.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/http.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/http.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/irc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/irc.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/loaded_scripts.html: -------------------------------------------------------------------------------- 1 | Home

LOADED_SCRIPTS

Download: loaded_scripts Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Log the loaded scripts.

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%202/loaded_scripts%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/notice.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/notice.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/packet_filter.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/packet_filter.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/ssl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/ssl.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_2/weird.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_2/weird.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/app_stats.html: -------------------------------------------------------------------------------- 1 | Home

APP_STATS

Download: app_stats Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%203/app_stats%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/conn.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/conn.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/dhcp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/dhcp.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/dns.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/dns.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/dpd.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/dpd.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/files.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/files.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/http.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/http.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/irc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/irc.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/loaded_scripts.html: -------------------------------------------------------------------------------- 1 | Home

LOADED_SCRIPTS

Download: loaded_scripts Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

Log the loaded scripts.

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%203/loaded_scripts%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/notice.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/notice.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/packet_filter.html: -------------------------------------------------------------------------------- 1 | Home

PACKET_FILTER

Download: packet_filter Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

This script supports how Bro sets its BPF capture filter. By default Bro sets a capture filter that allows all traffic. If a filter is set on the command line, that filter takes precedence over the default open filter and all filters defined in Bro scripts with the capture_filters and restrict_filters variables.

Attribute Information

Relevant Papers

Bro Logs http://gauss.ececs.uc.edu/Courses/c6055/pdf/bro_log_vars.pdf

Neise, Patrick. "Intrusion Detection Through Relationship Analysis". Oct 2016 https://www.sans.org/reading-room/whitepapers/detection/intrusion-detection-relationship-analysis-37352

Frances Bernadette C. De Ocampo, Trisha Mari L. Del Castillo, Miguel Alberto N. Gomez. "AUTOMATED SIGNATURE CREATOR FOR A SIGNATURE BASED INTRUSION DETECTION SYSTEM WITH NETWORK ATTACK DETECTION CAPABILITIES". 2013 http://sdiwc.net/digital-library/automated-signature-creator-for-a-signature-based-intrusion-detection-system-with-network-attack-detection-capabilities-pancakes.html

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/Bro%20Logs%20from%20Threatglass/Part%203/packet_filter%20analysis.ipynb -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/ssl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/ssl.html -------------------------------------------------------------------------------- /Data/Bro log from Threatglass/HTML_Bro_log_3/weird.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/Bro log from Threatglass/HTML_Bro_log_3/weird.html -------------------------------------------------------------------------------- /Data/Bro logs from Threatglass datasets/Bro logs from Threatglass datasets.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/1P9dC1WXEUrypY0Y9lWnVZgFB1S5ZZ6jd -------------------------------------------------------------------------------- /Data/PE malware dataset description/OPCleaver.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/PE malware dataset description/OPCleaver.html -------------------------------------------------------------------------------- /Data/PE malware dataset description/VirusShare.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/PE malware dataset 
description/VirusShare.html -------------------------------------------------------------------------------- /Data/PE malware dataset description/Zeus.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/PE malware dataset description/Zeus.html -------------------------------------------------------------------------------- /Data/PE malware datasets/PE malware datasets.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/1u-AFeS8Dctz5vP7Ohdq82npuANwPOoRk -------------------------------------------------------------------------------- /Data/System datasets description/auth.html: -------------------------------------------------------------------------------- 1 | Home

AUTH

Download: auth Zip File

Abstract

Source

Mike Sconzo

Security Repository

Secrepo.com

Dataset Information

-

Attribute Information

Relevant Papers

Associated Data Science Notebook

https://github.com/cyberdefenders/MachineLearning/blob/master/Data_analysis/System%20analysis/auth%20analysis.ipynb -------------------------------------------------------------------------------- /Data/System datasets/System datasets.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/1g8I08WAe0HaIl5nfZH1X-kIC5ZQwAUOu -------------------------------------------------------------------------------- /Data/network datasets description/dhcp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/dhcp.html -------------------------------------------------------------------------------- /Data/network datasets description/dns.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/dns.html -------------------------------------------------------------------------------- /Data/network datasets description/files.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/files.html -------------------------------------------------------------------------------- /Data/network datasets description/ftp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/ftp.html -------------------------------------------------------------------------------- /Data/network datasets description/http.html: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/http.html -------------------------------------------------------------------------------- /Data/network datasets description/notice.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/notice.html -------------------------------------------------------------------------------- /Data/network datasets description/smtp.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/smtp.html -------------------------------------------------------------------------------- /Data/network datasets description/ssh.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/ssh.html -------------------------------------------------------------------------------- /Data/network datasets description/ssl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/ssl.html -------------------------------------------------------------------------------- /Data/network datasets description/tunnel.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/tunnel.html -------------------------------------------------------------------------------- /Data/network datasets description/weird.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberdefendersprogram/MachineLearning/56f4551433e091bcd8b185df7fc0048ba3a7bf00/Data/network datasets description/weird.html -------------------------------------------------------------------------------- /Data/network datasets/network datasets.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/open?id=1qBmJhVqPprD-esGKgtm6VLn_YOpjawJ- -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/app_stats analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"app_stats.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
Data TypeCountUnique ValuesMissing Values
tsfloat649427980
ts_deltafloat6494210
appobject94230
uniq_hostsint6494210
hitsint64942470
bytesint649427700
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " Data Type Count Unique Values Missing Values\n", 104 | "ts float64 942 798 0\n", 105 | "ts_delta float64 942 1 0\n", 106 | "app object 942 3 0\n", 107 | "uniq_hosts int64 942 1 0\n", 108 | "hits int64 942 47 0\n", 109 | "bytes int64 942 770 0" 110 | ] 111 | }, 112 | "execution_count": 2, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "#DataFrame with columns\n", 119 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 120 | "\n", 121 | "#DataFrame with data types\n", 122 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 123 | "\n", 124 | "#DataFrame with Count\n", 125 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 126 | "\n", 127 | "#DataFrame with unique values\n", 128 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 129 | "for v in list(df.columns.values):\n", 130 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 131 | "\n", 132 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 133 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 134 | "print('Data Quality Report')\n", 135 | "data_quality_report" 136 | ] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.6.5" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 2 160 | } 161 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/dhcp analysis.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dhcp.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64520652060
uidobject520626010
id.orig_hobject52061280
id.orig_pint64520610
id.resp_hobject52061280
id.resp_pint64520610
macobject52061280
assigned_ipobject52061280
lease_timefloat64520610
trans_idint64520652060
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 5206 5206 0\n", 133 | "uid object 5206 2601 0\n", 134 | "id.orig_h object 5206 128 0\n", 135 | "id.orig_p int64 5206 1 0\n", 136 | "id.resp_h object 5206 128 0\n", 137 | "id.resp_p int64 5206 1 0\n", 138 | "mac object 5206 128 0\n", 139 | "assigned_ip object 5206 128 0\n", 140 | "lease_time float64 5206 1 0\n", 141 | "trans_id int64 5206 5206 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": 
"ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/dpd analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dpd.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 6, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6488880
uidobject88880
id.orig_hobject88310
id.orig_pint6488650
id.resp_hobject88300
id.resp_pint648860
protoobject8810
analyzerobject8820
failure_reasonobject8840
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " Data Type Count Unique Values Missing Values\n", 125 | "ts float64 88 88 0\n", 126 | "uid object 88 88 0\n", 127 | "id.orig_h object 88 31 0\n", 128 | "id.orig_p int64 88 65 0\n", 129 | "id.resp_h object 88 30 0\n", 130 | "id.resp_p int64 88 6 0\n", 131 | "proto object 88 1 0\n", 132 | "analyzer object 88 2 0\n", 133 | "failure_reason object 88 4 0" 134 | ] 135 | }, 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "#DataFrame with columns\n", 143 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 144 | "\n", 145 | "#DataFrame with data types\n", 146 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 147 | "\n", 148 | "#DataFrame with Count\n", 149 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 150 | "\n", 151 | "#DataFrame with unique values\n", 152 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 153 | "for v in list(df.columns.values):\n", 154 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 155 | "\n", 156 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 157 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 158 | "print('Data Quality Report')\n", 159 | "data_quality_report" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.5" 187 | } 188 | }, 189 | 
"nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/irc analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"irc.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6430200
uidobject3080
id.orig_hobject3070
id.orig_pint643070
id.resp_hobject3070
id.resp_pint643070
nickobject2288
userobject14616
commandobject3030
valueobject30200
addlobject22138
dcc_file_namefloat640030
dcc_file_sizefloat640030
dcc_mime_typefloat640030
fuidfloat640030
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " Data Type Count Unique Values Missing Values\n", 167 | "ts float64 30 20 0\n", 168 | "uid object 30 8 0\n", 169 | "id.orig_h object 30 7 0\n", 170 | "id.orig_p int64 30 7 0\n", 171 | "id.resp_h object 30 7 0\n", 172 | "id.resp_p int64 30 7 0\n", 173 | "nick object 22 8 8\n", 174 | "user object 14 6 16\n", 175 | "command object 30 3 0\n", 176 | "value object 30 20 0\n", 177 | "addl object 22 13 8\n", 178 | "dcc_file_name float64 0 0 30\n", 179 | "dcc_file_size float64 0 0 30\n", 180 | "dcc_mime_type float64 0 0 30\n", 181 | "fuid float64 0 0 30" 182 | ] 183 | }, 184 | "execution_count": 2, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "#DataFrame with columns\n", 191 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 192 | "\n", 193 | "#DataFrame with data types\n", 194 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 195 | "\n", 196 | "#DataFrame with Count\n", 197 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 198 | "\n", 199 | "#DataFrame with unique values\n", 200 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 201 | "for v in list(df.columns.values):\n", 202 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 203 | "\n", 204 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 205 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 206 | "print('Data Quality Report')\n", 207 | "data_quality_report" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | 
"file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.6.5" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 2 239 | } 240 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/loaded_scripts analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"loaded_scripts.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | "
Data TypeCountUnique ValuesMissing Values
nameobject6472502650
\n", 65 | "
" 66 | ], 67 | "text/plain": [ 68 | " Data Type Count Unique Values Missing Values\n", 69 | "name object 647250 265 0" 70 | ] 71 | }, 72 | "execution_count": 2, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "#DataFrame with columns\n", 79 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 80 | "\n", 81 | "#DataFrame with data types\n", 82 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 83 | "\n", 84 | "#DataFrame with Count\n", 85 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 86 | "\n", 87 | "#DataFrame with unique values\n", 88 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 89 | "for v in list(df.columns.values):\n", 90 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 91 | "\n", 92 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 93 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 94 | "print('Data Quality Report')\n", 95 | "data_quality_report" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.6.5" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 1/packet_filter analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"packet_filter.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64258925890
nodeobject258910
filterobject258910
initobject258910
successobject258910
\n", 93 | "
" 94 | ], 95 | "text/plain": [ 96 | " Data Type Count Unique Values Missing Values\n", 97 | "ts float64 2589 2589 0\n", 98 | "node object 2589 1 0\n", 99 | "filter object 2589 1 0\n", 100 | "init object 2589 1 0\n", 101 | "success object 2589 1 0" 102 | ] 103 | }, 104 | "execution_count": 2, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#DataFrame with columns\n", 111 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 112 | "\n", 113 | "#DataFrame with data types\n", 114 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 115 | "\n", 116 | "#DataFrame with Count\n", 117 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 118 | "\n", 119 | "#DataFrame with unique values\n", 120 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 121 | "for v in list(df.columns.values):\n", 122 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 123 | "\n", 124 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 125 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 126 | "print('Data Quality Report')\n", 127 | "data_quality_report" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.5" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- 
/Data_analysis/Bro Logs from Threatglass/Part 1/weird analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"weird.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6414005138920
uidobject11416110742589
id.orig_hobject114161352589
id.orig_pfloat64114165602589
id.resp_hobject1141610212589
id.resp_pfloat6411416242589
nameobject14005300
addlobject2214003
noticeobject1400510
peerobject1400510
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 14005 13892 0\n", 133 | "uid object 11416 11074 2589\n", 134 | "id.orig_h object 11416 135 2589\n", 135 | "id.orig_p float64 11416 560 2589\n", 136 | "id.resp_h object 11416 1021 2589\n", 137 | "id.resp_p float64 11416 24 2589\n", 138 | "name object 14005 30 0\n", 139 | "addl object 2 2 14003\n", 140 | "notice object 14005 1 0\n", 141 | "peer object 14005 1 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | 
"pygments_lexer": "ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/app_stats analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"app_stats.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
Data TypeCountUnique ValuesMissing Values
tsfloat645944980
ts_deltafloat6459410
appobject59430
uniq_hostsint6459410
hitsint64594400
bytesint645944930
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " Data Type Count Unique Values Missing Values\n", 104 | "ts float64 594 498 0\n", 105 | "ts_delta float64 594 1 0\n", 106 | "app object 594 3 0\n", 107 | "uniq_hosts int64 594 1 0\n", 108 | "hits int64 594 40 0\n", 109 | "bytes int64 594 493 0" 110 | ] 111 | }, 112 | "execution_count": 2, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "#DataFrame with columns\n", 119 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 120 | "\n", 121 | "#DataFrame with data types\n", 122 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 123 | "\n", 124 | "#DataFrame with Count\n", 125 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 126 | "\n", 127 | "#DataFrame with unique values\n", 128 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 129 | "for v in list(df.columns.values):\n", 130 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 131 | "\n", 132 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 133 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 134 | "print('Data Quality Report')\n", 135 | "data_quality_report" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.6.5" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | 
-------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/dhcpanalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dhcp.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64339933990
uidobject339916990
id.orig_hobject33991280
id.orig_pint64339910
id.resp_hobject33991280
id.resp_pint64339910
macobject33991280
assigned_ipobject33991280
lease_timefloat64339910
trans_idint64339933990
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 3399 3399 0\n", 133 | "uid object 3399 1699 0\n", 134 | "id.orig_h object 3399 128 0\n", 135 | "id.orig_p int64 3399 1 0\n", 136 | "id.resp_h object 3399 128 0\n", 137 | "id.resp_p int64 3399 1 0\n", 138 | "mac object 3399 128 0\n", 139 | "assigned_ip object 3399 128 0\n", 140 | "lease_time float64 3399 1 0\n", 141 | "trans_id int64 3399 3399 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": 
"ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/dpd analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dpd.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6449490
uidobject49490
id.orig_hobject49110
id.orig_pint6449420
id.resp_hobject49160
id.resp_pint644940
protoobject4910
analyzerobject4910
failure_reasonobject4910
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " Data Type Count Unique Values Missing Values\n", 125 | "ts float64 49 49 0\n", 126 | "uid object 49 49 0\n", 127 | "id.orig_h object 49 11 0\n", 128 | "id.orig_p int64 49 42 0\n", 129 | "id.resp_h object 49 16 0\n", 130 | "id.resp_p int64 49 4 0\n", 131 | "proto object 49 1 0\n", 132 | "analyzer object 49 1 0\n", 133 | "failure_reason object 49 1 0" 134 | ] 135 | }, 136 | "execution_count": 2, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "#DataFrame with columns\n", 143 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 144 | "\n", 145 | "#DataFrame with data types\n", 146 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 147 | "\n", 148 | "#DataFrame with Count\n", 149 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 150 | "\n", 151 | "#DataFrame with unique values\n", 152 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 153 | "for v in list(df.columns.values):\n", 154 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 155 | "\n", 156 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 157 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 158 | "print('Data Quality Report')\n", 159 | "data_quality_report" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.5" 187 | } 188 | }, 189 | 
"nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/irc analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"irc.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64640
uidobject620
id.orig_hobject620
id.orig_pint64620
id.resp_hobject620
id.resp_pint64620
nickobject422
userobject224
commandobject630
valueobject660
addlobject442
dcc_file_namefloat64006
dcc_file_sizefloat64006
dcc_mime_typefloat64006
fuidfloat64006
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " Data Type Count Unique Values Missing Values\n", 167 | "ts float64 6 4 0\n", 168 | "uid object 6 2 0\n", 169 | "id.orig_h object 6 2 0\n", 170 | "id.orig_p int64 6 2 0\n", 171 | "id.resp_h object 6 2 0\n", 172 | "id.resp_p int64 6 2 0\n", 173 | "nick object 4 2 2\n", 174 | "user object 2 2 4\n", 175 | "command object 6 3 0\n", 176 | "value object 6 6 0\n", 177 | "addl object 4 4 2\n", 178 | "dcc_file_name float64 0 0 6\n", 179 | "dcc_file_size float64 0 0 6\n", 180 | "dcc_mime_type float64 0 0 6\n", 181 | "fuid float64 0 0 6" 182 | ] 183 | }, 184 | "execution_count": 2, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "#DataFrame with columns\n", 191 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 192 | "\n", 193 | "#DataFrame with data types\n", 194 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 195 | "\n", 196 | "#DataFrame with Count\n", 197 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 198 | "\n", 199 | "#DataFrame with unique values\n", 200 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 201 | "for v in list(df.columns.values):\n", 202 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 203 | "\n", 204 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 205 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 206 | "print('Data Quality Report')\n", 207 | "data_quality_report" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 
| "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.6.5" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 2 239 | } 240 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/loaded_scripts analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"loaded_scripts.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | "
Data TypeCountUnique ValuesMissing Values
nameobject4230002610
\n", 65 | "
" 66 | ], 67 | "text/plain": [ 68 | " Data Type Count Unique Values Missing Values\n", 69 | "name object 423000 261 0" 70 | ] 71 | }, 72 | "execution_count": 2, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "#DataFrame with columns\n", 79 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 80 | "\n", 81 | "#DataFrame with data types\n", 82 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 83 | "\n", 84 | "#DataFrame with Count\n", 85 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 86 | "\n", 87 | "#DataFrame with unique values\n", 88 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 89 | "for v in list(df.columns.values):\n", 90 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 91 | "\n", 92 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 93 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 94 | "print('Data Quality Report')\n", 95 | "data_quality_report" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.6.5" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 2/packet_filter analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"packet_filter.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64169216920
nodeobject169210
filterobject169210
initobject169210
successobject169210
\n", 93 | "
" 94 | ], 95 | "text/plain": [ 96 | " Data Type Count Unique Values Missing Values\n", 97 | "ts float64 1692 1692 0\n", 98 | "node object 1692 1 0\n", 99 | "filter object 1692 1 0\n", 100 | "init object 1692 1 0\n", 101 | "success object 1692 1 0" 102 | ] 103 | }, 104 | "execution_count": 2, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#DataFrame with columns\n", 111 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 112 | "\n", 113 | "#DataFrame with data types\n", 114 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 115 | "\n", 116 | "#DataFrame with Count\n", 117 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 118 | "\n", 119 | "#DataFrame with unique values\n", 120 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 121 | "for v in list(df.columns.values):\n", 122 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 123 | "\n", 124 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 125 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 126 | "print('Data Quality Report')\n", 127 | "data_quality_report" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.5" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- 
/Data_analysis/Bro Logs from Threatglass/Part 2/weird analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"weird.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64958395210
uidobject789176901692
id.orig_hobject78911291692
id.orig_pfloat6478913771692
id.resp_hobject78916241692
id.resp_pfloat647891131692
nameobject9583230
addlobject719576
noticeobject958310
peerobject958310
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 9583 9521 0\n", 133 | "uid object 7891 7690 1692\n", 134 | "id.orig_h object 7891 129 1692\n", 135 | "id.orig_p float64 7891 377 1692\n", 136 | "id.resp_h object 7891 624 1692\n", 137 | "id.resp_p float64 7891 13 1692\n", 138 | "name object 9583 23 0\n", 139 | "addl object 7 1 9576\n", 140 | "notice object 9583 1 0\n", 141 | "peer object 9583 1 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | 
"pygments_lexer": "ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/app_stats analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"app_stats.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | "
Data TypeCountUnique ValuesMissing Values
tsfloat648787270
ts_deltafloat6487810
appobject87830
uniq_hostsint6487810
hitsint64878460
bytesint648787150
\n", 100 | "
" 101 | ], 102 | "text/plain": [ 103 | " Data Type Count Unique Values Missing Values\n", 104 | "ts float64 878 727 0\n", 105 | "ts_delta float64 878 1 0\n", 106 | "app object 878 3 0\n", 107 | "uniq_hosts int64 878 1 0\n", 108 | "hits int64 878 46 0\n", 109 | "bytes int64 878 715 0" 110 | ] 111 | }, 112 | "execution_count": 2, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "#DataFrame with columns\n", 119 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 120 | "\n", 121 | "#DataFrame with data types\n", 122 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 123 | "\n", 124 | "#DataFrame with Count\n", 125 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 126 | "\n", 127 | "#DataFrame with unique values\n", 128 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 129 | "for v in list(df.columns.values):\n", 130 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 131 | "\n", 132 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 133 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 134 | "print('Data Quality Report')\n", 135 | "data_quality_report" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.6.5" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | 
-------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/dhcp analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dhcp.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64478047800
uidobject478023930
id.orig_hobject47801280
id.orig_pint64478010
id.resp_hobject47801280
id.resp_pint64478010
macobject47801280
assigned_ipobject47801280
lease_timefloat64478010
trans_idint64478047800
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 4780 4780 0\n", 133 | "uid object 4780 2393 0\n", 134 | "id.orig_h object 4780 128 0\n", 135 | "id.orig_p int64 4780 1 0\n", 136 | "id.resp_h object 4780 128 0\n", 137 | "id.resp_p int64 4780 1 0\n", 138 | "mac object 4780 128 0\n", 139 | "assigned_ip object 4780 128 0\n", 140 | "lease_time float64 4780 1 0\n", 141 | "trans_id int64 4780 4780 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": 
"ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/dpd analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dpd.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6475750
uidobject75750
id.orig_hobject75240
id.orig_pint6475610
id.resp_hobject75290
id.resp_pint647540
protoobject7510
analyzerobject7510
failure_reasonobject7520
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " Data Type Count Unique Values Missing Values\n", 125 | "ts float64 75 75 0\n", 126 | "uid object 75 75 0\n", 127 | "id.orig_h object 75 24 0\n", 128 | "id.orig_p int64 75 61 0\n", 129 | "id.resp_h object 75 29 0\n", 130 | "id.resp_p int64 75 4 0\n", 131 | "proto object 75 1 0\n", 132 | "analyzer object 75 1 0\n", 133 | "failure_reason object 75 2 0" 134 | ] 135 | }, 136 | "execution_count": 2, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "#DataFrame with columns\n", 143 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 144 | "\n", 145 | "#DataFrame with data types\n", 146 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 147 | "\n", 148 | "#DataFrame with Count\n", 149 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 150 | "\n", 151 | "#DataFrame with unique values\n", 152 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 153 | "for v in list(df.columns.values):\n", 154 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 155 | "\n", 156 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 157 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 158 | "print('Data Quality Report')\n", 159 | "data_quality_report" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.5" 187 | } 188 | }, 189 | 
"nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/irc analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"irc.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6421150
uidobject2160
id.orig_hobject2160
id.orig_pint642160
id.resp_hobject2150
id.resp_pint642150
nickobject1566
userobject9612
commandobject2130
valueobject21180
addlobject15126
dcc_file_namefloat640021
dcc_file_sizefloat640021
dcc_mime_typefloat640021
fuidfloat640021
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " Data Type Count Unique Values Missing Values\n", 167 | "ts float64 21 15 0\n", 168 | "uid object 21 6 0\n", 169 | "id.orig_h object 21 6 0\n", 170 | "id.orig_p int64 21 6 0\n", 171 | "id.resp_h object 21 5 0\n", 172 | "id.resp_p int64 21 5 0\n", 173 | "nick object 15 6 6\n", 174 | "user object 9 6 12\n", 175 | "command object 21 3 0\n", 176 | "value object 21 18 0\n", 177 | "addl object 15 12 6\n", 178 | "dcc_file_name float64 0 0 21\n", 179 | "dcc_file_size float64 0 0 21\n", 180 | "dcc_mime_type float64 0 0 21\n", 181 | "fuid float64 0 0 21" 182 | ] 183 | }, 184 | "execution_count": 2, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "#DataFrame with columns\n", 191 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 192 | "\n", 193 | "#DataFrame with data types\n", 194 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 195 | "\n", 196 | "#DataFrame with Count\n", 197 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 198 | "\n", 199 | "#DataFrame with unique values\n", 200 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 201 | "for v in list(df.columns.values):\n", 202 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 203 | "\n", 204 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 205 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 206 | "print('Data Quality Report')\n", 207 | "data_quality_report" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | 
"file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.6.5" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 2 239 | } 240 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/loaded_scripts analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"loaded_scripts.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | "
Data TypeCountUnique ValuesMissing Values
nameobject5955002650
\n", 65 | "
" 66 | ], 67 | "text/plain": [ 68 | " Data Type Count Unique Values Missing Values\n", 69 | "name object 595500 265 0" 70 | ] 71 | }, 72 | "execution_count": 2, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "#DataFrame with columns\n", 79 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 80 | "\n", 81 | "#DataFrame with data types\n", 82 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 83 | "\n", 84 | "#DataFrame with Count\n", 85 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 86 | "\n", 87 | "#DataFrame with unique values\n", 88 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 89 | "for v in list(df.columns.values):\n", 90 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 91 | "\n", 92 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 93 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 94 | "print('Data Quality Report')\n", 95 | "data_quality_report" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.6.5" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /Data_analysis/Bro Logs from Threatglass/Part 3/packet_filter analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"packet_filter.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64238223820
nodeobject238210
filterobject238210
initobject238210
successobject238210
\n", 93 | "
" 94 | ], 95 | "text/plain": [ 96 | " Data Type Count Unique Values Missing Values\n", 97 | "ts float64 2382 2382 0\n", 98 | "node object 2382 1 0\n", 99 | "filter object 2382 1 0\n", 100 | "init object 2382 1 0\n", 101 | "success object 2382 1 0" 102 | ] 103 | }, 104 | "execution_count": 2, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#DataFrame with columns\n", 111 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 112 | "\n", 113 | "#DataFrame with data types\n", 114 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 115 | "\n", 116 | "#DataFrame with Count\n", 117 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 118 | "\n", 119 | "#DataFrame with unique values\n", 120 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 121 | "for v in list(df.columns.values):\n", 122 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 123 | "\n", 124 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 125 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 126 | "print('Data Quality Report')\n", 127 | "data_quality_report" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.5" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- 
/Data_analysis/Bro Logs from Threatglass/Part 3/weird analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"weird.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6412873127950
uidobject10491101932382
id.orig_hobject104911332382
id.orig_pfloat64104916562382
id.resp_hobject1049110192382
id.resp_pfloat6410491182382
nameobject12873300
addlobject3212870
noticeobject1287310
peerobject1287310
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 12873 12795 0\n", 133 | "uid object 10491 10193 2382\n", 134 | "id.orig_h object 10491 133 2382\n", 135 | "id.orig_p float64 10491 656 2382\n", 136 | "id.resp_h object 10491 1019 2382\n", 137 | "id.resp_p float64 10491 18 2382\n", 138 | "name object 12873 30 0\n", 139 | "addl object 3 2 12870\n", 140 | "notice object 12873 1 0\n", 141 | "peer object 12873 1 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | 
"pygments_lexer": "ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Network analysis/dhcp analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"dhcp.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64150214970
uidobject150214180
id.orig_hobject15021000
id.orig_pint64150210
id.resp_hobject150230
id.resp_pint64150210
macobject1502870
assigned_ipobject1502990
lease_timefloat64150220
trans_idint64150214760
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 1502 1497 0\n", 133 | "uid object 1502 1418 0\n", 134 | "id.orig_h object 1502 100 0\n", 135 | "id.orig_p int64 1502 1 0\n", 136 | "id.resp_h object 1502 3 0\n", 137 | "id.resp_p int64 1502 1 0\n", 138 | "mac object 1502 87 0\n", 139 | "assigned_ip object 1502 99 0\n", 140 | "lease_time float64 1502 2 0\n", 141 | "trans_id int64 1502 1476 0" 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.6.5" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | 
-------------------------------------------------------------------------------- /Data_analysis/Network analysis/ftp analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"ftp.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | "
Data TypeCountUnique ValuesMissing Values
tsfloat64579623900
uidobject57961370
id.orig_hobject5796150
id.orig_pint645796950
id.resp_hobject5796210
id.resp_pint64579610
userobject579640
passwordobject57451251
commandobject579660
argobject296615452830
mime_typeobject9585701
file_sizefloat64105785691
reply_codefloat645756940
reply_msgobject5756418440
passiveobject289722899
orig_hobject2897192899
resp_hobject2897202899
resp_pfloat64289727692899
fuidobject5486466310
\n", 191 | "
" 192 | ], 193 | "text/plain": [ 194 | " Data Type Count Unique Values Missing Values\n", 195 | "ts float64 5796 2390 0\n", 196 | "uid object 5796 137 0\n", 197 | "id.orig_h object 5796 15 0\n", 198 | "id.orig_p int64 5796 95 0\n", 199 | "id.resp_h object 5796 21 0\n", 200 | "id.resp_p int64 5796 1 0\n", 201 | "user object 5796 4 0\n", 202 | "password object 5745 12 51\n", 203 | "command object 5796 6 0\n", 204 | "arg object 2966 1545 2830\n", 205 | "mime_type object 95 8 5701\n", 206 | "file_size float64 105 78 5691\n", 207 | "reply_code float64 5756 9 40\n", 208 | "reply_msg object 5756 4184 40\n", 209 | "passive object 2897 2 2899\n", 210 | "orig_h object 2897 19 2899\n", 211 | "resp_h object 2897 20 2899\n", 212 | "resp_p float64 2897 2769 2899\n", 213 | "fuid object 5486 466 310" 214 | ] 215 | }, 216 | "execution_count": 2, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "#DataFrame with columns\n", 223 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 224 | "\n", 225 | "#DataFrame with data types\n", 226 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 227 | "\n", 228 | "#DataFrame with Count\n", 229 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 230 | "\n", 231 | "#DataFrame with unique values\n", 232 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 233 | "for v in list(df.columns.values):\n", 234 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 235 | "\n", 236 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 237 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 238 | "print('Data Quality Report')\n", 239 | "data_quality_report" 240 | ] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 
251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.6.5" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 2 264 | } 265 | -------------------------------------------------------------------------------- /Data_analysis/Network analysis/ssh analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"ssh.csv\", low_memory=False)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 8, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6465983440440
uidobject6552651651457
id.orig_hobject65526221457
id.orig_pfloat646552625772457
id.resp_hobject65526279457
id.resp_pfloat6465526183457
nameobject65983500
addlobject86512565118
noticeobject6598310
peerobject6598310
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 65983 44044 0\n", 133 | "uid object 65526 51651 457\n", 134 | "id.orig_h object 65526 221 457\n", 135 | "id.orig_p float64 65526 25772 457\n", 136 | "id.resp_h object 65526 279 457\n", 137 | "id.resp_p float64 65526 183 457\n", 138 | "name object 65983 50 0\n", 139 | "addl object 865 125 65118\n", 140 | "notice object 65983 1 0\n", 141 | "peer object 65983 1 0" 142 | ] 143 | }, 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | 
"pygments_lexer": "ipython3", 194 | "version": "3.6.5" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /Data_analysis/Network analysis/tunnel analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"tunnel.csv\")" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | "
Data TypeCountUnique ValuesMissing Values
tsfloat642802800
uidobject2801400
id.orig_hobject28030
id.orig_pint642801390
id.resp_hobject280810
id.resp_pint6428010
tunnel_typeobject28010
actionobject28020
\n", 114 | "
" 115 | ], 116 | "text/plain": [ 117 | " Data Type Count Unique Values Missing Values\n", 118 | "ts float64 280 280 0\n", 119 | "uid object 280 140 0\n", 120 | "id.orig_h object 280 3 0\n", 121 | "id.orig_p int64 280 139 0\n", 122 | "id.resp_h object 280 81 0\n", 123 | "id.resp_p int64 280 1 0\n", 124 | "tunnel_type object 280 1 0\n", 125 | "action object 280 2 0" 126 | ] 127 | }, 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "#DataFrame with columns\n", 135 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 136 | "\n", 137 | "#DataFrame with data types\n", 138 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 139 | "\n", 140 | "#DataFrame with Count\n", 141 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 142 | "\n", 143 | "#DataFrame with unique values\n", 144 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 145 | "for v in list(df.columns.values):\n", 146 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 147 | "\n", 148 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 149 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 150 | "print('Data Quality Report')\n", 151 | "data_quality_report" 152 | ] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.6.5" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 2 176 | } 177 | -------------------------------------------------------------------------------- /Data_analysis/Network 
analysis/weird analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from mpl_toolkits.mplot3d import Axes3D\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "\n", 13 | "df = pd.read_csv(\"weird.csv\", low_memory=False)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Data Quality Report\n" 26 | ] 27 | }, 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
Data TypeCountUnique ValuesMissing Values
tsfloat6465983440440
uidobject6552651651457
id.orig_hobject65526221457
id.orig_pfloat646552625772457
id.resp_hobject65526279457
id.resp_pfloat6465526183457
nameobject65983500
addlobject86512565118
noticeobject6598310
peerobject6598310
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " Data Type Count Unique Values Missing Values\n", 132 | "ts float64 65983 44044 0\n", 133 | "uid object 65526 51651 457\n", 134 | "id.orig_h object 65526 221 457\n", 135 | "id.orig_p float64 65526 25772 457\n", 136 | "id.resp_h object 65526 279 457\n", 137 | "id.resp_p float64 65526 183 457\n", 138 | "name object 65983 50 0\n", 139 | "addl object 865 125 65118\n", 140 | "notice object 65983 1 0\n", 141 | "peer object 65983 1 0" 142 | ] 143 | }, 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "#DataFrame with columns\n", 151 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 152 | "\n", 153 | "#DataFrame with data types\n", 154 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 155 | "\n", 156 | "#DataFrame with Count\n", 157 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 158 | "\n", 159 | "#DataFrame with unique values\n", 160 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 161 | "for v in list(df.columns.values):\n", 162 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 163 | "\n", 164 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 165 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 166 | "print('Data Quality Report')\n", 167 | "data_quality_report" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.6.5" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | 
-------------------------------------------------------------------------------- /Data_analysis/System analysis/auth analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "df = pd.read_csv(\"auth.csv\")" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 8, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Data Quality Report\n" 24 | ] 25 | }, 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | "
Data TypeCountUnique ValuesMissing Values
tsobject86839231400
ipobject86839278950
daemonobject86839186830
resultobject86839479070
\n", 84 | "
" 85 | ], 86 | "text/plain": [ 87 | " Data Type Count Unique Values Missing Values\n", 88 | "ts object 86839 23140 0\n", 89 | "ip object 86839 27895 0\n", 90 | "daemon object 86839 18683 0\n", 91 | "result object 86839 47907 0" 92 | ] 93 | }, 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "#DataFrame with columns\n", 101 | "columns = pd.DataFrame(list(df.columns.values[1:]))\n", 102 | "\n", 103 | "#DataFrame with data types\n", 104 | "data_types = pd.DataFrame(df.dtypes, columns=['Data Type'])\n", 105 | "\n", 106 | "#DataFrame with Count\n", 107 | "data_count = pd.DataFrame(df.count(), columns=['Count'])\n", 108 | "\n", 109 | "#DataFrame with unique values\n", 110 | "unique_value_counts = pd.DataFrame(columns=['Unique Values'])\n", 111 | "for v in list(df.columns.values):\n", 112 | " unique_value_counts.loc[v] = [df[v].nunique()]\n", 113 | "\n", 114 | "missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values'])\n", 115 | "data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts)\n", 116 | "print('Data Quality Report')\n", 117 | "data_quality_report" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.7.0" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /Machine_learning_practice/machine_learning.py: -------------------------------------------------------------------------------- 1 | # Import packages needed 2 | import os 3 | 
# Columns excluded from the feature matrix: the label column itself plus the
# attributes this script has always dropped before training.
_DROPPED_COLUMNS = ['classification','usage_counter', 'normal_prio', 'policy', 'vm_pgoff', 'task_size', 'cached_hole_size', 'hiwater_rss', 'nr_ptes', 'lock', 'cgtime', 'signal_nvcsw']

# Number of leading rows used for training, and number of rows directly after
# them that are held out for testing.
_TRAIN_ROWS = 50000
_TEST_ROWS = 10000


def _features_and_labels(frame):
    """Split *frame* into an unscaled feature array and a label array.

    Labels are taken from the 'classification' column with "malware" mapped
    to 0 and "benign" mapped to 1.
    """
    # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
    # which newer pandas releases no longer accept.
    features = np.array(frame.drop(columns=_DROPPED_COLUMNS))
    labels = np.array(frame['classification'].replace("malware",0).replace("benign",1))
    return features, labels


# This function builds training dataset and testing dataset
def Build_Data_Set():
    """Load '../Data/out.csv' and return scaled train/test arrays.

    The first ``_TRAIN_ROWS`` rows form the training set and the next
    ``_TEST_ROWS`` rows form the testing set, so the two never overlap.
    (The previous test slice, ``df[10000:]``, contained most of the training
    rows, which made the reported accuracy a measure of memorisation.)

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as numpy arrays.
    """
    df = pd.read_csv('../Data/out.csv', index_col=0)
    df_train = df[:_TRAIN_ROWS]
    df_test = df[_TRAIN_ROWS:_TRAIN_ROWS + _TEST_ROWS]

    X_train, y_train = _features_and_labels(df_train)
    X_test, y_test = _features_and_labels(df_test)

    # Fit the scaling on the training rows only and reuse it for the test
    # rows, so both sets live in the same feature space.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test # Return arrays


# This function builds a machine learning model using scikit-learn svm algorithm and compute the Accuracy of the prediction
def Analysis():
    """Train a linear SVM on the training set and print its test accuracy.

    The accuracy is printed as a percentage of correctly predicted test rows.
    """
    X_train, X_test, y_train, y_test = Build_Data_Set() # Building training and testing datasets

    clf = svm.SVC(kernel="linear", C=0.01) # Declare a svm object in with scikit-learn package
    clf.fit(X_train, y_train) # Training the model with the training dataset and labels
    result = clf.predict(X_test) # Running a prediction on every testing sample

    # Derive the sample count from the data instead of hard-coding it, so the
    # score always covers exactly the rows that were predicted.
    test_size = len(y_test)
    correct_count = int(np.sum(result == y_test))

    print("Accuracy:", (correct_count/test_size)*100)
    return


start = time.time()
Analysis() # run the program
end = time.time()

elapsed = end - start

print("Time:",elapsed)
# Convert a tab-separated log file into a csv file so we can manipulate it with pandas
dhcp_path = '/Users/citlalingalvan/Downloads/dhcp.log'

# Open the input first so a missing log does not leave an empty output.csv behind.
with open(dhcp_path, encoding="utf8") as file:
    # newline='' hands line-ending control to the csv module; without it the
    # writer emits an extra blank line after every row on Windows.
    with open('output.csv', 'w', encoding='utf-8', newline='') as csvfile:
        w = csv.writer(csvfile, dialect='excel')
        # Stream line by line: this avoids loading the whole log into memory
        # and no longer writes a spurious empty row for the final newline.
        w.writerows(line.rstrip('\n').split('\t') for line in file)
# Column headers written as the first CSV row, keyed by the log file name.
dic = {
    "dhcp.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "mac", "assigned_ip", "lease_time", "trans_id"],
    "dns.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "port", "query", "qclass", "qclass_name", "qtype", "qtype_name", "rcode", "rcode_name", "QR", "AA", "TC", "RD", "Z", "answers", "TTLs", "rejected"],
    "ftp.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "user", "password", "command", "arg", "mime_type", "file_size", "reply_code", "reply_msg", "passive", "orig_h", "resp_h", "resp_p", "fuid"],
    "ssh.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "status", "direction", "client", "server", "resp_size"],
    "files.log": ["ts", "fuid", "tx_hosts", "rx_hosts", "conn_uids", "source", "depth", "analyzers", "mime_type", "filename", "duration", "local_orig", "is_orig", "seen_bytes", "total_bytes", "missing_bytes", "overflow_bytes", "timedout", "parent_fuid", "md5/sha1/sha256", "extracted"],
    "http.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "trans_depth", "method", "host", "uri", "referrer", "user_agent", "request_ body_len", "response_ body_len", "status_code", "status_msg", "info_code", "info_msg", "filename", "tags", "username", "password", "proxied", "orig_fuids", "orig_mime_types", "resp_fuids", "resp_mime_types"],
    "notice.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "fuid", "file_mime_type", "file_desc", "proto", "note", "msg", "sub", "src", "dst", "p", "n", "peer_descr", "actions", "suppress_for", "dropped"],
    "smtp.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "trans_depth", "helo", "mailfrom", "rcptto", "date", "from", "to", "in_reply_to", "subject", "x_originating_ip", "first_received", "second_received", "last_reply", "path", "user_agent", "tls", "fuids", "is_webmail"],
    "ssl.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "version", "cipher", "server_name", "session_id", "subject", "issuer_subject", "not_valid_before", "not_valid_after", "last_alert", "client_subject", "clnt_issuer_subject", "cer_hash", "validation_status"],
    "tunnel.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "tunnel_type", "action"],
    "weird.log": ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "name", "addl", "notice", "peer"],
}

path = "network"
for filename in os.listdir(path):
    # Convert only the logs we have a header for. This also skips the .csv
    # files produced by an earlier run, which previously were truncated and
    # then aborted the script with a KeyError.
    header = dic.get(filename)
    if header is None:
        continue

    source_path = os.path.join(path, filename)
    # Swap only the extension; str.replace("log", "csv") would also rewrite
    # any "log" appearing in the stem of the name.
    target_path = os.path.join(path, os.path.splitext(filename)[0] + ".csv")

    with open(source_path, encoding="utf8") as file:
        with open(target_path, 'w', encoding='utf-8', newline='') as csvfile:
            w = csv.writer(csvfile, dialect='excel')
            w.writerow(header)
            # Iterate the file directly so the last record is kept even when
            # the log does not end with a newline.
            for line in file:
                # A lone "-" marks an unset field in the log; store it as an
                # empty cell so pandas reads it as a missing value.
                w.writerow(["" if item == "-" else item
                            for item in line.rstrip('\n').split('\t')])
# This function is to check if an element exist in a list
def checkExist(listSection, ele):
    """Return True if *ele* equals any item of *listSection*, else False.

    Args:
        listSection: iterable of section names to search.
        ele: the section name to look for.
    """
    # The membership operator performs the same linear equality scan as the
    # previous hand-written loop.
    return ele in listSection
item['Entropy'] 43 | if item['Entropy'] > highest: 44 | highest = item['Entropy'] 45 | 46 | if highest > 7: 47 | highEntropy = 1 48 | if lowest < 1: 49 | lowEntropy = 1 50 | 51 | # TotalSuspiciousSections and TotalNonSuspiciousSections extraction 52 | numberSuspicious = 0 53 | numberNonSuspicious = 0 54 | for item in j['PE Sections']: 55 | if checkExist(standardSection, item['Name']['Value']): 56 | numberNonSuspicious += 1 57 | else: 58 | numberSuspicious += 1 59 | 60 | 61 | #SectionAlignment Extraction 62 | sectionAlignment = j['OPTIONAL_HEADER']['SectionAlignment']['Value'] 63 | 64 | #FileAlignment Extraction 65 | fileAlignment = j['OPTIONAL_HEADER']['FileAlignment']['Value'] 66 | 67 | # SizeOfHeaders Extraction 68 | sizeOfHeader = j['OPTIONAL_HEADER']['SizeOfHeaders']['Value'] 69 | 70 | timeStamp = j['FILE_HEADER']['TimeDateStamp']['Value'] 71 | 72 | # ImageBase Extraction 73 | imageBase = j['OPTIONAL_HEADER']['ImageBase']['Value'] 74 | 75 | # SizeOfImage Extraction 76 | sizeOfImage = j['OPTIONAL_HEADER']['SizeOfImage']['Value'] 77 | 78 | #DllCharacteristics Extraction 79 | dllCharacteristics = j['OPTIONAL_HEADER']['DllCharacteristics']['Value'] 80 | 81 | # Characteristics Extraction 82 | characteristics = j['FILE_HEADER']['Characteristics']['Value'] 83 | 84 | row = [filename, sectionAlignment, fileAlignment, sizeOfHeader, timeStamp, imageBase, sizeOfImage, dllCharacteristics, characteristics, highEntropy, lowEntropy, numberSuspicious, numberNonSuspicious] 85 | # print(row) 86 | result.append(row) 87 | 88 | with open('Zeus.csv', 'w+', newline='') as f: 89 | thewriter = csv.writer(f) 90 | 91 | thewriter.writerows(result) 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /Scripts/html_Generator.py: -------------------------------------------------------------------------------- 1 | import gspread 2 | import pandas as pd 3 | from oauth2client.service_account import ServiceAccountCredentials 4 | 5 | # 
define scopes accept requests 6 | scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] 7 | 8 | # create credentials from json keyfile 9 | credentials = ServiceAccountCredentials.from_json_keyfile_name('your key file name', scope) 10 | 11 | # authorize the credential 12 | gc = gspread.authorize(credentials) 13 | 14 | # Open spreadsheet 15 | wks = gc.open('Your spreadsheet name').sheet1 16 | 17 | # Read each row of the spreadsheet 18 | for i in range(0,len(wks.get_all_records())): 19 | row = wks.get_all_records()[i] 20 | 21 | # data quality table 22 | df = pd.read_csv("../network/"+row["Name of dataset"].replace(" ","")+".csv") 23 | columns = pd.DataFrame(list(df.columns.values[1:])) 24 | #DataFrame with data types 25 | data_types = pd.DataFrame(df.dtypes, columns=['Data Type']) 26 | 27 | #DataFrame with Count 28 | data_count = pd.DataFrame(df.count(), columns=['Count']) 29 | 30 | #DataFrame with unique values 31 | unique_value_counts = pd.DataFrame(columns=['Unique Values']) 32 | for v in list(df.columns.values): 33 | unique_value_counts.loc[v] = [df[v].nunique()] 34 | 35 | missing_data_counts = pd.DataFrame(df.isnull().sum(), columns=['Missing Values']) 36 | data_quality_report = data_types.join(data_count).join(unique_value_counts).join(missing_data_counts) 37 | 38 | # Open a new html file 39 | f = open(row["Name of dataset"].replace(" ","")+".html",'w') 40 | 41 | ''' 42 | Customize content of html files 43 | ''' 44 | message = "" 45 | message += "" 46 | message += "" 47 | message += "Home" 48 | message += "

" + row["Name of dataset"].upper() + "

" 49 | message += "

Download: "+row["Name of dataset"]+" Zip File

" 50 | message += "

Abstract

" 51 | message += "" 73 | message += "

Source

" 74 | 75 | # Split elements of Source section 76 | source = row["Source"].split(",") 77 | for item in source: 78 | message += "

"+item+"

" 79 | message += "

Dataset Information

" 80 | message += "

"+row["Information"]+"

" 81 | message += "

Attribute Information

" 82 | message += "" 85 | message += "

Relevant Papers

" 86 | 87 | # Split elemnts Relevant Papers section 88 | relevant = row["Relevant Papers"].split(";") 89 | for item in relevant: 90 | message += "

"+item.split("|")[0]+""+item.split("|")[1]+"

" 91 | message += "

Associate Data Science Notebook

" 92 | message += ""+row['Associate Data Science Notebook:']+"" 93 | message += "" 94 | message += "" 95 | 96 | # Write the content into the file 97 | f.write(message) 98 | 99 | # Close the file after work 100 | f.close() --------------------------------------------------------------------------------