├── .gitattributes
├── .gitignore
├── LICENSE
├── MalwareArtifacts.csv
├── README.md
├── cybermachine.py
├── extractPE.py
├── malwareML.py
├── requirements.txt
├── spam.csv
├── spamML.py
├── urlML.py
└── url_spam_classification.csv
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 |
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 |
111 | # SageMath parsed files
112 | *.sage.py
113 |
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 |
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 |
127 | # Rope project settings
128 | .ropeproject
129 |
130 | # mkdocs documentation
131 | /site
132 |
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 |
138 | # Pyre type checker
139 | .pyre/
140 |
141 | # pytype static type analyzer
142 | .pytype/
143 |
144 | # Cython debug symbols
145 | cython_debug/
146 |
147 | # PyCharm
148 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150 | # and can be added to the global gitignore or merged into this file. For a more nuclear
151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
152 | #.idea/
153 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Emrah Yıldırım
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
CyberMachine 🤖
2 |
3 |
4 |
5 |
6 |
7 | About The Project 📰
8 |
9 | CyberMachine uses machine learning to detect cyber threats that target end users. It can perform malware analysis on a given exe file and spam analysis on a given URL or e-mail message.
10 |
11 |
12 |
13 |
14 | Installation ⏳
15 |
16 | * Clone the repo
17 |
18 | ```sh
19 | git clone https://github.com/emr4h/CyberMachine.git
20 | ```
21 |
22 |
23 |
24 | Getting Started 🕵️♂️
25 |
26 | * Go to the CyberMachine folder:
27 | ```sh
28 | cd CyberMachine
29 | ```
30 |
31 | * Install the requirements
32 |
33 | ```sh
34 | pip3 install -r requirements.txt
35 | ```
36 |
37 |
38 |
39 | Usage 👨🏻💻
40 |
41 |
42 | * Then you can access the help menu with the command below:
43 |
44 | ```sh
45 | python3 cybermachine.py --help
46 | ```
47 |
48 |
49 |
50 | * Malware Analysis with ML :
51 |
52 | ```sh
53 | python3 cybermachine.py --exe <"file.exe">
54 | ```
55 |
56 | https://user-images.githubusercontent.com/60710585/163443140-a43f407c-ade6-48e0-a87f-c431c2c4fb50.mp4
57 |
58 |
59 |
60 | * Mail Analysis with ML :
61 |
62 | ```sh
63 | python3 cybermachine.py --mail <"message">
64 | ```
65 |
66 | https://user-images.githubusercontent.com/60710585/163433972-d560bfff-b8a6-4215-a502-041203244836.mp4
67 |
68 |
69 |
70 | * Url Analysis with ML :
71 |
72 | ```sh
73 | python3 cybermachine.py --url <"link">
74 | ```
75 |
76 | https://user-images.githubusercontent.com/60710585/163436436-95537447-eb14-4c8a-8aa1-11e4cc86dc1a.mp4
77 |
78 |
79 |
80 |
81 | Details 👀
82 |
83 | If you are curious about the machine learning applications, success rates and analysis approaches used in the project, you can review my repositories below.
84 |
85 |
86 | * Malware Analysis with Machine Learning
87 |
88 | ```sh
89 | https://github.com/emr4h/Malware-Detection-Using-Machine-Learning
90 | ```
91 |
92 | * Spam Analysis with Machine Learning
93 |
94 | ```sh
95 | https://github.com/emr4h/Spam-Email-and-Url-Detection-Using-Machine-Learning
96 | ```
97 |
98 |
99 |
100 | Support 🎗
101 |
102 | If you like the project, please give a star ⭐️ and don't forget to buy me a coffee ☕️
103 |
104 | 
105 |
106 |
107 | Follow me:
108 |
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/cybermachine.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import subprocess
4 | import random
5 | from pyfiglet import Figlet
6 | from extractPE import fileExtract
7 | from malwareML import machineLearnMalware
8 | from spamML import machineLearnSpam
9 | from urlML import machineLearnUrl
10 |
11 |
# Blank lines to separate the banner from the shell prompt.
print("\n\n\n")

# Pick one FIGlet font at random so the banner differs from run to run.
fontList = ["big","bulbhead","roman","epic","larry3d","speed","nancyj","stampatello","smslant","slscript","serifcap","rounded","puffy","o8","letters","colossal","basic"]
fontType = random.choice(fontList)
f = Figlet(font=fontType)
print(f.renderText('Cyber Machine'))

print("by emr4h\n")

# argparse prefixes its error messages with ``prog`` ("<prog>: error: ..."),
# so it has to name this script and must not contain a newline. The custom
# ``usage`` text replaces the auto-generated usage line.
parser = argparse.ArgumentParser(
    prog="cybermachine.py",
    description="Threat Analysis Tool for End User",
    usage="\n\n Malware Analysis with ML: python3 cybermachine.py --exe \n Email Analysis with ML: python3 cybermachine.py --mail \n Url Analysis with ML: python3 cybermachine.py --url ",
)
parser.add_argument("--exe", help = "Malware Analysis with ML, give value in exe file type")
parser.add_argument("--mail", type=str, help = "Email Spam Analysis with ML, give value in string type ")
parser.add_argument("--url", type=str, help = "Url Spam Analysis with ML, give value in string type ")


# Parsed at module level; the ``__main__`` section below reads ``args``.
args = parser.parse_args()
29 |
def analysisMalware(argument):
    """Run the malware pipeline on one executable and print the verdict.

    ``fileExtract`` writes the PE features of the file to ``inputData.csv``
    in the current working directory, ``machineLearnMalware`` returns the sum
    of its classifiers' predictions, and a sum of 2 or more is reported as
    malware.

    Args:
        argument: Path of the executable to analyse (the ``--exe`` value).
    """
    import os  # function-scope import keeps this block self-contained

    try:
        fileExtract(argument)
        result = machineLearnMalware()
        if result >= 2:
            print("ML Prediction --> Malware.\n")
        else:
            print("ML Prediction --> Secure.\n")
    finally:
        # Remove the temporary feature file even when extraction or prediction
        # fails. os.remove is used instead of spawning `rm`, which is not
        # available on Windows.
        try:
            os.remove("inputData.csv")
        except FileNotFoundError:
            # Nothing was written (e.g. the PE file could not be opened).
            pass
38 |
39 |
def analysisSpam(argument):
    """Classify an e-mail message and print the verdict.

    Args:
        argument: Message text (the ``--mail`` value), forwarded unchanged to
            ``machineLearnSpam``.
    """
    votes = machineLearnSpam(argument)
    # Fewer than two positive votes means the message is treated as safe.
    if votes < 2:
        print("ML Prediction --> Secure.\n")
        return
    print("ML Prediction --> Spam.\n")
46 |
def analysisUrl(argument):
    """Classify a URL and print the verdict.

    Args:
        argument: URL string (the ``--url`` value), forwarded unchanged to
            ``machineLearnUrl``.
    """
    votes = machineLearnUrl(argument)
    # Fewer than two positive votes means the URL is treated as safe.
    if votes < 2:
        print("ML Prediction --> Secure.\n")
        return
    print("ML Prediction --> Spam.\n")
53 |
54 |
if __name__=='__main__':

    # The options are independent, so several analyses can run in one call.
    if args.exe:
        analysisMalware(args.exe)

    if args.mail:
        analysisSpam(args.mail)

    if args.url:
        analysisUrl(args.url)

    # Without any option the tool used to exit silently after the banner;
    # show the help text so the user learns what to pass.
    if not (args.exe or args.mail or args.url):
        parser.print_help()
65 |
66 |
--------------------------------------------------------------------------------
/extractPE.py:
--------------------------------------------------------------------------------
1 | import pefile
2 | import csv
3 |
def fileExtract(data):
    """Write the PE header features of an executable to ``inputData.csv``.

    The CSV is created in the current working directory and holds a header
    row followed by one row of feature values (as strings). The last column,
    ``IfMalware``, is left as a blank placeholder.

    Args:
        data: Path of the PE file; passed directly to ``pefile.PE``.

    Raises:
        Whatever ``pefile.PE`` raises when the file cannot be read or parsed.
        In that case no CSV file is written.
    """
    print("Extracting the PE information of the file...")
    header =["AddressOfEntryPoint","MajorLinkerVersion","MajorImageVersion","MajorOperatingSystemVersion","DllCharacteristics","SizeOfStackReserve","NumberOfSections","ResourceSize","IfMalware"]

    # Parse the executable before touching the output file, so a parse failure
    # does not leave a header-only CSV behind.
    pe = pefile.PE(data)
    try:
        optionalHeader = pe.OPTIONAL_HEADER
        inputData = [
            str(optionalHeader.AddressOfEntryPoint),
            str(optionalHeader.MajorLinkerVersion),
            str(optionalHeader.MajorImageVersion),
            str(optionalHeader.MajorOperatingSystemVersion),
            str(optionalHeader.DllCharacteristics),
            str(optionalHeader.SizeOfStackReserve),
            str(pe.FILE_HEADER.NumberOfSections),
            # Data directory entry 2 feeds the "ResourceSize" column.
            str(optionalHeader.DATA_DIRECTORY[2].Size),
            " ",  # placeholder for the malware label column
        ]
    finally:
        # Release the file handle / mapping held by pefile.
        pe.close()

    with open('inputData.csv', 'w', encoding='UTF8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        # Header row first, then the single row describing the analysed file.
        writer.writerow(header)
        writer.writerow(inputData)
    print("The file was successfully extracted.")
27 |
28 |
--------------------------------------------------------------------------------
/malwareML.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import tree
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.neighbors import KNeighborsClassifier
6 |
7 |
def machineLearnMalware() :
    """Train three classifiers and vote on the file described by ``inputData.csv``.

    The training set ``MalwareArtifacts.csv`` is loaded from the directory of
    this module, so the tool works regardless of what the checkout directory
    is called. ``inputData.csv`` is read from the current working directory.
    The first eight columns of both files are used as features and column 8
    of the training set as the label.

    Returns:
        int: Sum of the Decision Tree, Random Forest and 1-nearest-neighbour
        predictions for the first row of ``inputData.csv``. Presumably the
        labels are 0/1, making this a vote count between 0 and 3 -- confirm
        against the data set.
    """
    import os  # function-scope import keeps this block self-contained

    moduleDir = os.path.dirname(os.path.abspath(__file__))
    dataSet = pd.read_csv(os.path.join(moduleDir, 'MalwareArtifacts.csv'))
    fileData = pd.read_csv('inputData.csv')

    featureColumns = [0,1,2,3,4,5,6,7]
    features = dataSet.iloc[:,featureColumns].values
    ifMalware = dataSet.iloc[:,8].values

    fileFeatures = fileData.iloc[:,featureColumns].values

    print("The model is training using a total of 137444 data ...\n")

    # (announcement, untrained model) pairs, fitted in this order.
    models = [
        ("Prediction using Decision Trees ...\n", tree.DecisionTreeClassifier()),
        ("Prediction using Random Forest ...\n", RandomForestClassifier()),
        ("Prediction using Kneighbors ...\n", KNeighborsClassifier(n_neighbors=1)),
    ]

    votes = []
    for announcement, model in models:
        print(announcement)
        model.fit(features, ifMalware)
        prediction = model.predict(fileFeatures)
        print(prediction)
        print("\n")
        # Index the single prediction explicitly: calling int() on a
        # 1-element array is deprecated in recent NumPy releases.
        votes.append(prediction[0])

    return int(sum(votes))
47 |
48 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.21.2
2 | pandas==1.3.2
3 | pefile==2021.5.24
4 | pyfiglet==0.7
5 | scikit_learn==1.0.2
6 |
--------------------------------------------------------------------------------
/spamML.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn import tree
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn import svm
6 | from sklearn.feature_extraction.text import CountVectorizer
7 |
8 |
def machineLearnSpam(message):
    """Train three classifiers on ``spam.csv`` and vote on ``message``.

    The data set is loaded from the directory of this module, so the tool
    works regardless of what the checkout directory is called. The
    ``Category`` column is mapped to 1 for ``'spam'`` and 0 otherwise;
    column 1 is vectorised with a bag-of-words ``CountVectorizer`` and
    column 0 is used as the label (assumes ``Category`` is the first column
    -- confirm against the CSV).

    Args:
        message: Text of the e-mail to classify.

    Returns:
        int: Sum of the Decision Tree, Random Forest and SVM predictions
        (each 0 or 1), i.e. the number of models that flagged the message.
    """
    import os  # function-scope import keeps this block self-contained

    moduleDir = os.path.dirname(os.path.abspath(__file__))
    dataSet = pd.read_csv(os.path.join(moduleDir, 'spam.csv'))
    dataSet.Category = dataSet.Category.apply(lambda x: 1 if x == 'spam' else 0)
    texts = dataSet.iloc[:,1]
    ifSpam = dataSet.iloc[:,0]

    cv = CountVectorizer()
    features = cv.fit_transform(texts)
    # The user's message must be encoded with the vocabulary learned above.
    userInput = cv.transform([message])
    print("The model is training using a total of 5572 data...\n")

    # (announcement, untrained model) pairs, fitted in this order.
    models = [
        ("Prediction using Decision Trees ...", tree.DecisionTreeClassifier()),
        ("Prediction using Random Forest ...", RandomForestClassifier()),
        ("Prediction using Support Vector Machine ...", svm.SVC()),
    ]

    votes = []
    for announcement, model in models:
        print(announcement)
        model.fit(features, ifSpam)
        prediction = model.predict(userInput)
        print(prediction)
        print("\n")
        # Index the single prediction explicitly: calling int() on a
        # 1-element array is deprecated in recent NumPy releases.
        votes.append(prediction[0])

    return int(sum(votes))
47 |
48 |
--------------------------------------------------------------------------------
/urlML.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.feature_extraction.text import CountVectorizer
4 | from sklearn.naive_bayes import MultinomialNB
5 | from sklearn.svm import LinearSVC
6 | from sklearn.linear_model import SGDClassifier
7 |
8 |
def machineLearnUrl(userInput):
    """Train three classifiers on the URL data set and vote on ``userInput``.

    ``url_spam_classification.csv`` is loaded from the directory of this
    module, so the tool works regardless of what the checkout directory is
    called. Column 0 is used as the URL text and column 1 (``is_spam``) as
    the label, mapped to 1 when its string form is ``"True"`` and 0
    otherwise.

    Args:
        userInput: URL string to classify.

    Returns:
        int: Sum of the SGD, Naive Bayes and linear SVM predictions (each 0
        or 1), i.e. the number of models that flagged the URL.
    """
    import os  # function-scope import keeps this block self-contained

    moduleDir = os.path.dirname(os.path.abspath(__file__))
    url = pd.read_csv(os.path.join(moduleDir, 'url_spam_classification.csv'))

    # Normalise the label to 0/1 whatever dtype pandas inferred for it.
    url['is_spam'] = url['is_spam'].apply(lambda x: 1 if str(x) == "True" else 0)

    urls = url.iloc[:,0]
    ifSpam = url.iloc[:,1]

    def extractUrl(data):
        """Return the unique tokens of a URL.

        The URL is split on '/', each part on '-', and each of those pieces
        on '.'; both the dash-level pieces and the dot-level pieces are kept.
        """
        tokens = set()
        for segment in str(data).split('/'):
            for piece in segment.split('-'):
                tokens.add(piece)
                tokens.update(piece.split('.'))
        return list(tokens)

    cv = CountVectorizer(tokenizer=extractUrl)

    print("The model is training using a total of 148303 url data ...\n")

    features = cv.fit_transform(urls)
    # The user's URL must be encoded with the vocabulary learned above.
    features_test = cv.transform([userInput])

    # (announcement, untrained model) pairs, fitted in this order. The second
    # entry used to be announced as "Decision Trees" although the model is
    # Multinomial Naive Bayes.
    models = [
        ("Prediction using Stochastic Gradient Descent ...", SGDClassifier()),
        ("Prediction using Naive Bayes ...", MultinomialNB()),
        ("Prediction using Linear Support Vector Machine ...", LinearSVC()),
    ]

    votes = []
    for announcement, model in models:
        print(announcement)
        model.fit(features, ifSpam)
        prediction = model.predict(features_test)
        print(prediction)
        print("\n")
        # Index the single prediction explicitly: calling int() on a
        # 1-element array is deprecated in recent NumPy releases.
        votes.append(prediction[0])

    return int(sum(votes))
70 |
71 |
--------------------------------------------------------------------------------