├── ASM ├── ASM file to TEXT file (removing unwanted data).ipynb ├── CSV data based on frequency Count of Opcode.ipynb ├── Data Splitting.ipynb ├── Data Visualization (ASM).ipynb ├── Feature Selection.ipynb ├── Min-max normalization(opcodes).ipynb ├── Read me ├── Reducing opcode feature.ipynb ├── opc_list.npy └── sect_list.npy ├── Byte ├── Byte file to Image Conversion.ipynb ├── Data Visualization (Byte).ipynb ├── Feature extractor (CNN with pretrained encoder layers) - optimization.ipynb ├── Feature extractor (CNN-optimization).ipynb ├── Images to Comma-Separated Values(csv).ipynb ├── Min-max normalization(Byte).ipynb ├── Read me ├── Resizing images to 32x32.ipynb ├── Stratified sampling.ipynb ├── cnn_stacking.py ├── cnn_v1.py └── dataloader_csv.py ├── Hybrid (Final) ├── ANN-Results.ipynb ├── Creating hybrid dataset.ipynb ├── Data Visualization (Hybrid).ipynb ├── Min-max normalization (hybrid dataset).ipynb ├── Read me ├── ann_hybrid.py ├── dataloader_csv.py └── final_hybrid_csv.py └── README.md /ASM/ASM file to TEXT file (removing unwanted data).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ASM file to TEXT file (removing unwanted data)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Importing libraries and initilizing variables" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from pyparsing import oneOf,Word,hexnums,Optional,WordEnd,alphas,alphanums\n", 24 | "import numpy as np\n", 25 | "\n", 26 | "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n", 27 | "\n", 28 | "data_path='...\\\\train\\\\'\n", 29 | "to_save=\"...\\\\asm_to_text\\\\\"\n", 30 | "\n", 31 | "section_list=[]\n", 32 | "opcode_list=[]\n", 33 | "files_not_parsed=[]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "Reading ID's and labels from csv" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n", 50 | "malwarelist=malwarelist[1:]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Making an ASM file line parser" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "hex_integer = Word(hexnums) + WordEnd()\n", 67 | "line = hex_integer + Optional((hex_integer*(1,))(\"instructions\") + Word(alphas,alphanums)(\"opcode\"))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "This is the main part of the code, as it takes several days to process all files so it is designed in such a way that you can run and stop this section again and again without losing any data." 
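A note on the resume logic referred to above: the loop below persists its bookkeeping lists with np.save, and the commented-out block reloads them on later runs. A minimal sketch of an automatic version of that pattern (assuming the .npy checkpoints sit in the working directory; the load_list helper is hypothetical, not part of the notebook):

```python
# Sketch only: auto-resume the bookkeeping lists instead of toggling comments.
# (In the notebook, c_files starts out seeded with the eight known-bad sample IDs.)
import os
import numpy as np

def load_list(fname):
    # Reload a previously saved list if its checkpoint exists, else start fresh.
    return np.load(fname).tolist() if os.path.exists(fname) else []

c_files = load_list('c_files.npy')                     # samples already converted
files_not_parsed = load_list('files_not_parsed.npy')   # samples that raised errors
section_list = load_list('sect_list.npy')              # section names seen so far
opcode_list = load_list('opc_list.npy')                # opcodes seen so far
```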
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "for name in malwarelist:\n", 84 | " b = \".bytes\"\n", 85 | " nam=name[0].strip('\"')\n", 86 | " \n", 87 | "\n", 88 | "# =========Run following commands only if you are not runing this code for the first time =========\n", 89 | "\n", 90 | "# c_files=np.load('c_files.npy')\n", 91 | "# files_not_parsed=np.load('files_not_parsed.npy')\n", 92 | "# c_files=c_files.tolist()\n", 93 | "# files_not_parsed=files_not_parsed.tolist()\n", 94 | "# section_list=np.load('sect_list.npy')\n", 95 | "# opcode_list=np.load('opc_list.npy')\n", 96 | "# section_list=section_list.tolist()\n", 97 | "# opcode_list=opcode_list.tolist()\n", 98 | "# =================================================================================================\n", 99 | "\n", 100 | " \n", 101 | " if nam not in c_files:\n", 102 | " if nam not in files_not_parsed:\n", 103 | " nam=str(nam)\n", 104 | " nam_asm=data_path+nam+\".asm\"\n", 105 | " nam_txt=to_save+nam+\".txt\"\n", 106 | " asm_list=[]\n", 107 | " try:\n", 108 | " with open(nam_asm,encoding='ISO-8859-1') as f:\n", 109 | " lines=f.readlines()\n", 110 | " for source_line in lines:\n", 111 | " if source_line==\"\\n\":\n", 112 | " continue \n", 113 | " section=source_line[:source_line.find(':')+1]\n", 114 | " if section not in section_list:\n", 115 | " section_list.append(str(section)) \n", 116 | " result = line.parseString(source_line[source_line.find(':')+1:])\n", 117 | " if \"opcode\" in result:\n", 118 | " opcod=result.opcode\n", 119 | " x=section+' '+opcod\n", 120 | " asm_list.append(x)\n", 121 | " if opcod not in opcode_list:\n", 122 | " opcode_list.append(str(result.opcode))\n", 123 | " with open(nam_txt, \"a\") as file_prime:\n", 124 | " for i in asm_list:\n", 125 | " file_prime.write(str(i)+ '\\n')\n", 126 | " \n", 127 | " c_files.append(nam) \n", 128 | " except:\n", 129 | " files_not_parsed.append(nam)\n", 130 | " \n", 131 | " np.save('c_files.npy',c_files)\n", 132 | " np.save('files_not_parsed.npy',files_not_parsed)\n", 133 | " np.save('sect_list.npy', section_list)\n", 134 | " np.save('opc_list.npy', opcode_list)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Note: We also parsed the files individually that are not parsed according to this scheme and update the sect_list and opc_list. You are given the updated sect_list and opc_list." 
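For reference, the parser built above consumes whatever follows the section prefix of an .asm line: an address, optionally followed by instruction bytes and a mnemonic. A small standalone check of that behaviour (the sample line is invented, not taken from the dataset):

```python
from pyparsing import Word, hexnums, Optional, WordEnd, alphas, alphanums

hex_integer = Word(hexnums) + WordEnd()
line = hex_integer + Optional((hex_integer * (1,))("instructions")
                              + Word(alphas, alphanums)("opcode"))

# Hypothetical IDA-style line: "<section>:<address> <byte values> <mnemonic> <operands>"
sample = ".text:00401000 55 8B EC push    ebp"
section = sample[:sample.find(':') + 1]                 # ".text:"
result = line.parseString(sample[sample.find(':') + 1:])
if "opcode" in result:
    print(section, result.opcode)                       # .text: push
```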
142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.7.2" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /ASM/CSV data based on frequency Count of Opcode.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Opcode frequency Count based CSV data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The purpose of this code is to make CSV data that contain the frequency count of all opcode against each sample" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Importing libraries and initializing varivables" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import string\n", 32 | "import csv\n", 33 | "\n", 34 | "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n", 35 | "path = '..//asm_to_text//'\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Reading ID's and Labels from CSV" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n", 52 | "malwarelist=malwarelist[1:]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Making the first row that will be the field names in the CSV" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "opc=np.load('opc_list.npy')\n", 69 | "opc=opc.tolist()\n", 70 | "opc.append('ID')\n", 71 | "opc.append('LABEL')\n", 72 | "data_to_write=[]\n", 73 | "data_to_write.append(opc)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "This is the main section of the code that read text files and counts frequencies of different opcounts in each sample and stores them to the data_to_write list along with their ID's and labels " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "for name in malwarelist: \n", 90 | " \n", 91 | " nam=name[0].strip('\"')\n", 92 | " nam=str(nam)\n", 93 | " count=count+1\n", 94 | " print(nam,' ',count)\n", 95 | " temp=[0]*len(opc) \n", 96 | " nam=path+nam+\".txt\"\n", 97 | " if name[0].strip('\"') not in wrong: \n", 98 | " with open(nam) as f:\n", 99 | " lis=f.readlines()\n", 100 | " for source_lin in lis:\n", 101 | " if source_lin==\"\\n\":\n", 102 | " continue \n", 103 | " opco=source_lin[source_lin.find(':')+2:source_lin.find('\\\\')] \n", 104 | " temp[opc.index(opco)]=temp[opc.index(opco)]+1\n", 105 | 
"#=================In case of section use this=============================================\n", 106 | "# section=source_lin[:source_lin.find(':')+1]\n", 107 | "# section=section.translate({ord(c): None for c in string.whitespace})\n", 108 | "#========================================================================================\n", 109 | " temp[opc.index('ID')]=name[0].strip('\"')\n", 110 | " temp[opc.index('LABEL')]=name[1]\n", 111 | " data_to_write.append(temp)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "In this section, CSV is generated" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "myFile = open('data_opcode.csv', 'w', newline='') \n", 128 | "with myFile: \n", 129 | " writer = csv.writer(myFile)\n", 130 | " writer.writerows(data_to_write)" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 3", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.7.2" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 2 155 | } 156 | -------------------------------------------------------------------------------- /ASM/Data Splitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Splitting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this code same splitting is done as with images data. Same samples are put in validation, test, train and train full CSV as in the case of images dataset." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import csv" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Initializing the variables" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "path_tr='data_to_tra.csv'\n", 41 | "path_futr='data_to_traFull.csv'\n", 42 | "path_te='data_to_test.csv'\n", 43 | "path_val='data_to_val.csv'\n", 44 | "path_opc='500_reduce_opcode.csv'" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "Reading the data from CSV" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "original=np.genfromtxt(path_opc,delimiter=\",\",dtype=str)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Making training data CSV" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "temp=np.genfromtxt(path_tr,delimiter=\",\",dtype=str)\n", 77 | "\n", 78 | "to_csv=[]\n", 79 | "\n", 80 | "to_csv.append(original[0].tolist())\n", 81 | "\n", 82 | "for i in original:\n", 83 | " if i[-2] in temp[:,-2]:\n", 84 | " to_csv.append(i.tolist())\n", 85 | "\n", 86 | "myFile = open('n_500_data_to_tra.csv', 'w', newline='') \n", 87 | "\n", 88 | "with myFile: \n", 89 | " writer = csv.writer(myFile)\n", 90 | " writer.writerows(to_csv)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "Making train full data CSV" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "temp=np.genfromtxt(path_futr,delimiter=\",\",dtype=str)\n", 107 | "\n", 108 | "to_csv=[]\n", 109 | "\n", 110 | "to_csv.append(original[0].tolist())\n", 111 | "\n", 112 | "for i in original:\n", 113 | " if i[-2] in temp[:,-2]:\n", 114 | " to_csv.append(i.tolist())\n", 115 | "\n", 116 | "myFile = open('n_500_data_to_traFull.csv', 'w', newline='') \n", 117 | "\n", 118 | "with myFile: \n", 119 | " writer = csv.writer(myFile)\n", 120 | " writer.writerows(to_csv)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Making test CSV" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 6, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "temp=np.genfromtxt(path_te,delimiter=\",\",dtype=str)\n", 137 | "\n", 138 | "to_csv=[]\n", 139 | "\n", 140 | "to_csv.append(original[0].tolist())\n", 141 | "\n", 142 | "for i in original:\n", 143 | " if i[-2] in temp[:,-2]:\n", 144 | " to_csv.append(i.tolist())\n", 145 | "\n", 146 | "myFile = open('n_500_data_to_test.csv', 'w', newline='') \n", 147 | "\n", 148 | "with myFile: \n", 149 | " writer = csv.writer(myFile)\n", 150 | " writer.writerows(to_csv)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Making validation CSV" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "temp=np.genfromtxt(path_val,delimiter=\",\",dtype=str)\n", 167 | "\n", 168 | "to_csv=[]\n", 169 | "\n", 170 | "to_csv.append(original[0].tolist())\n", 171 | "\n", 172 
| "for i in original:\n", 173 | " if i[-2] in temp[:,-2]:\n", 174 | " to_csv.append(i.tolist())\n", 175 | "\n", 176 | "myFile = open('n_500_data_to_val.csv', 'w', newline='') \n", 177 | "\n", 178 | "with myFile: \n", 179 | " writer = csv.writer(myFile)\n", 180 | " writer.writerows(to_csv)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.7.2" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 2 205 | } 206 | -------------------------------------------------------------------------------- /ASM/Feature Selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Selection" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The following code is used for feature selection using wrapper based feature selection mechanism where SVM with rbf kernel is used as a classifier. Feature selection and backward elimination both techquies are used. top_feature variable is used for selecting the different number of opcodes. It finds out that top 116 features help better in classification. Following is the code where we get features. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Importing libraries and initializing variables" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "from sklearn import svm\n", 33 | "from sklearn.metrics import confusion_matrix\n", 34 | "from sklearn.preprocessing import label_binarize\n", 35 | "from sklearn.multiclass import OneVsRestClassifier\n", 36 | "path_tr='nn_500_data_to_tra.csv'\n", 37 | "path_futr='nn_500_data_to_traFull.csv'\n", 38 | "path_te='nn_500_data_to_test.csv'\n", 39 | "path_val='nn_500_data_to_val.csv'\n", 40 | "top_feature=116" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Reading data" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "train = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n", 57 | "val = np.genfromtxt(path_val, delimiter=\",\", dtype=str)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Data processing" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "x_data = train[1:,0:top_feature]\n", 74 | "x_label = train[1:,-1]\n", 75 | "x_data=x_data.astype(np.float)\n", 76 | "x_label=x_label.astype(np.int)\n", 77 | "# Binarize the output\n", 78 | "x_label = label_binarize(x_label, classes=[1,2,3,4,5,6,7,8,9])\n", 79 | "\n", 80 | "y_data = val[1:,0:top_feature]\n", 81 | "y_data=y_data.astype(np.float)\n", 82 | "y_label = val[1:,-1]\n", 83 | "y_label=y_label.astype(np.int)\n", 84 | "woby=y_label\n", 85 | "# Binarize the output\n", 86 | "y_label = label_binarize(y_label, 
classes=[1,2,3,4,5,6,7,8,9])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Applying custom grid search. optimize c and g values store in opt_c,opt_g respectively" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "c=[0.01,0.1,1,10,100]\n", 103 | "g=[0.01,0.1,1,10,100]\n", 104 | "loss=100\n", 105 | "opt_c=0\n", 106 | "opt_g=0\n", 107 | "for i in c:\n", 108 | " for j in g: \n", 109 | " clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=i,gamma=j, probability=True))\n", 110 | " clf_f = clf.fit(x_data, x_label)\n", 111 | " y_score=clf_f.decision_function(y_data)\n", 112 | " y_predict_prob=clf_f.predict_proba(y_data)\n", 113 | " y_predict=clf_f.predict(y_data)\n", 114 | " pre=y_predict.argmax(1)\n", 115 | " pre=pre+1\n", 116 | " a=log_loss(y_label,y_predict_prob)\n", 117 | " if a=opcode_freq:\n", 78 | " index_list.append(i)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "In this section row for field name of CSV is created by eliminating the less frequent opcodes" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "data_to_list=[]\n", 95 | "names=[]\n", 96 | "for i,j in enumerate(data_pd):\n", 97 | " if i in index_list:\n", 98 | " names.append(j)\n", 99 | "names.append('ID')\n", 100 | "names.append('LABELS')\n", 101 | "data_to_list.append(names)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Here data(selected opcodes) of samples copied to data_to_list along with their names and labels" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "for i in data_arr:\n", 118 | " temp=[]\n", 119 | " for j in index_list:\n", 120 | " temp.append(i[j])\n", 121 | " \n", 122 | " temp.append(i[-2])\n", 123 | " temp.append(i[-1])\n", 124 | " data_to_list.append(temp)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Saving new data to CSV" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "myFile = open('500_reduce_opcode.csv', 'w', newline='') \n", 141 | "with myFile: \n", 142 | " writer = csv.writer(myFile)\n", 143 | " writer.writerows(data_to_list)" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.7.2" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /ASM/opc_list.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberhunters/Malware-Detection-Using-Machine-Learning/d07b19ad409582493e7042b83cbb92a0e3906197/ASM/opc_list.npy -------------------------------------------------------------------------------- /ASM/sect_list.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyberhunters/Malware-Detection-Using-Machine-Learning/d07b19ad409582493e7042b83cbb92a0e3906197/ASM/sect_list.npy -------------------------------------------------------------------------------- /Byte/Byte file to Image Conversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Byte Files to Image Conversion" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Importing libraries " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from PIL import Image\n", 24 | "import numpy" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Fixing the width of images. As each byte file size varies so, we have to fix either its width or height for better visualization " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "width = 1366" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Reading the data (names and labels) from csv" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n", 57 | "malwarelist=malwarelist[1:]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Giving a path where to store images and from where to pick byte files " 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "destination='...\\\\images...\\\\'\n", 74 | "folder='...\\\\byte files....\\\\'" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "The following piece of code does the actual work. It opens each byte file, then read each record(row) and converts hexadecimal to decimal with act as a pixel value of the image. Image is formed using Pillow library by keeping fixed width As some of the byte files are corrupted, so a check is made to store these files name for removing them from actual data." 
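Concretely, each row of a .bytes file is an address followed by hexadecimal byte values; only the byte values are kept, '??' placeholders are skipped, and each value becomes one grayscale pixel. A worked micro-example (the sample row is made up):

```python
# Worked micro-example of the conversion described above.
row = b"00401000 4D 5A 90 00 ?? FF"
pixels = [int(tok, 16) for tok in row.split()[1:] if tok != b'??']
print(pixels)   # [77, 90, 144, 0, 255] -- grayscale pixel values in 0..255
```

The code below accumulates these values for the whole file and reshapes them into rows of `width` pixels before handing the array to Pillow.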
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "corrupted_files=[]\n", 91 | "for name in malwarelist:\n", 92 | " b = \".bytes\"\n", 93 | " nam=name[0].strip('\"')\n", 94 | " loc = folder+nam + b\n", 95 | " hexar = []\n", 96 | " with open(loc, 'rb') as f: \n", 97 | " for line in f:\n", 98 | " hexar.extend(int(el, 16) for el in line.split()[1:] if el != b'??')\n", 99 | " print(len(hexar)) \n", 100 | " if len(hexar)!=0:\n", 101 | " rn = len(hexar) // width\n", 102 | " fh = numpy.reshape(hexar[:rn * width], (-1, width))\n", 103 | " fh= np.uint8(fh)\n", 104 | " print(nam,' ',fh.shape)\n", 105 | " img = Image.fromarray(fh)\n", 106 | " img.save(destination+nam+\".png\")\n", 107 | " if len(hexar)==0:\n", 108 | " corrupted_files.append(nam)" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.7.2" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /Byte/Feature extractor (CNN with pretrained encoder layers) - optimization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature extractor (CNN with pretrained encoder layers) - optimization" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import torch\n", 24 | "import torch.nn as nn\n", 25 | "import torch.utils.data as data\n", 26 | "import numpy as np\n", 27 | "from cnn_stacking import autoencoder,autoencoder1\n", 28 | "from cnn_stacking import Cnn_Stacking\n", 29 | "from dataloader_csv import CustomDatasetFromImages" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Checking the availability of GPU " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", 46 | "if torch.cuda.is_available(): \n", 47 | " print(\"gpu available\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "This function is used to save the state of the model" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "path_to_checkpoint='...//save_state_cnn_stack//'\n", 64 | "check_name='checkpoint.pth.tar'\n", 65 | "def save_checkpoint(state, is_best, filename,loss):\n", 66 | " \"\"\"Save checkpoint if a new best is achieved\"\"\"\n", 67 | " if is_best:\n", 68 | " print (\"=> Saving a new lowest loss : \"+str(loss))\n", 69 | " torch.save(state, filename) # save checkpoint" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Following function takes 5 parameters. 
Purpose of this function is to train the encoder layer of autoencoder" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "def single_auto(model,_data,num_epochs,learning_rate,w_decay):\n", 86 | " \n", 87 | " optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n", 88 | " criterion = nn.MSELoss()\n", 89 | "\n", 90 | " for epoch in range(num_epochs):\n", 91 | " loss_list=[]\n", 92 | " for images, labels in _data:\n", 93 | "\n", 94 | " images = images.to(device)\n", 95 | "\n", 96 | " outputs = model(images)\n", 97 | "\n", 98 | " loss = criterion(outputs,images)\n", 99 | " optimizer.zero_grad() \n", 100 | " \n", 101 | " l1_reg = None\n", 102 | " for W in model.parameters():\n", 103 | " if l1_reg is None:\n", 104 | " l1_reg = W.norm(p=1)\n", 105 | " else:\n", 106 | " l1_reg = l1_reg + W.norm(p=1)\n", 107 | " \n", 108 | " loss=loss+l1_reg * w_decay\n", 109 | " \n", 110 | " loss.backward(retain_graph=True)\n", 111 | " \n", 112 | " optimizer.step()\n", 113 | " loss_list.append(loss.item())\n", 114 | " print('epoch [{}/{}], mean_loss:{:.4f}'.format(epoch + 1, num_epochs,np.mean(np.array(loss_list))))\n", 115 | " return model" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "The following function takes 6 parameters. This function first pass the data from pretrained encoder layer of an autoencoder and then pass it through the autoencoder1 for the pretraining of its encoder layer" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "def auto_to_auto(model_e,model_t,_data,num_epochs,learning_rate,w_decay):\n", 132 | " optimizer = torch.optim.Adam(model_t.parameters(), lr=learning_rate)\n", 133 | " criterion = nn.MSELoss()\n", 134 | " for epoch in range(num_epochs):\n", 135 | " loss_list=[]\n", 136 | " for images, labels in _data:\n", 137 | " \n", 138 | " images = images.to(device)\n", 139 | " \n", 140 | " images=model_e.encoder(images)\n", 141 | " \n", 142 | " outputs = model_t(images)\n", 143 | "\n", 144 | " loss = criterion(outputs,images)\n", 145 | " \n", 146 | " optimizer.zero_grad() \n", 147 | " \n", 148 | " \n", 149 | " l1_reg = None\n", 150 | " for W in model_t.parameters():\n", 151 | " if l1_reg is None:\n", 152 | " l1_reg = W.norm(p=1)\n", 153 | " else:\n", 154 | " l1_reg = l1_reg + W.norm(p=1)\n", 155 | " \n", 156 | " loss=loss+l1_reg * w_decay\n", 157 | " \n", 158 | " loss.backward(retain_graph=True)\n", 159 | " \n", 160 | " optimizer.step()\n", 161 | " loss_list.append(loss.item())\n", 162 | " print('epoch [{}/{}], mean_loss:{:.4f}'\n", 163 | " .format(epoch + 1, num_epochs,np.mean(np.array(loss_list))\n", 164 | " ))\n", 165 | " return model_t" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Loading the train and validation data" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "TRAIN_DATA_PATH = \"...//data_to_tra.csv\"\n", 182 | "VAL_DATA_PATH=\"...//data_to_val.csv\"\n", 183 | "\n", 184 | "\n", 185 | "train_data=CustomDatasetFromImages(TRAIN_DATA_PATH)\n", 186 | "train_data_loader = data.DataLoader(train_data,batch_size=20,shuffle=True)\n", 187 | "\n", 188 | "\n", 189 | "val_data=CustomDatasetFromImages(VAL_DATA_PATH)\n", 190 | "val_data_loader = 
data.DataLoader(val_data,batch_size=20,shuffle=True)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Setting the parameters" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 7, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "num_epochs,learning_rate,weight_decay = 100,0.001,1e-5" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Trainig the autoencoders" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 8, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "auto=single_auto(autoencoder().to(device).float(),train_data_loader,num_epochs,learning_rate,weight_decay)\n", 223 | "auto1=auto_to_auto(auto,autoencoder1().to(device).float(),train_data_loader,num_epochs,learning_rate,weight_decay)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Creating the model (stacking the pretrained encoder layer for fine tuning purpose ), defining the loss function and optimizers " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "model=Cnn_Stacking(auto,auto1).to(device).float()\n", 240 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-3)\n", 241 | "criterion = nn.CrossEntropyLoss()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "In this section, stacked encoder layers are fine tunned" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 11, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "total_step = len(train_data_loader)\n", 258 | "lowest_loss=0.300\n", 259 | "\n", 260 | "for epoch in range(num_epochs):\n", 261 | "\n", 262 | " for i, (images, labels) in enumerate(train_data_loader):\n", 263 | " \n", 264 | " images = images.to(device)\n", 265 | " labels = labels\n", 266 | " labels = labels.to(device)\n", 267 | "\n", 268 | " outputs = model(images)\n", 269 | "\n", 270 | " loss = criterion(outputs,labels)\n", 271 | "\n", 272 | " optimizer.zero_grad() \n", 273 | " loss.backward()\n", 274 | " optimizer.step()\n", 275 | "\n", 276 | " val_loss_list=[]\n", 277 | " for y,(val_images,val_labels) in enumerate(val_data_loader):\n", 278 | "\n", 279 | " val_images=val_images.to(device)\n", 280 | "\n", 281 | " val_labels=val_labels\n", 282 | " val_labels=val_labels.to(device)\n", 283 | "\n", 284 | " val_outputs=model(val_images)\n", 285 | "\n", 286 | " val_loss = criterion(val_outputs, val_labels)\n", 287 | " val_loss_list.append(val_loss.item())\n", 288 | " \n", 289 | " mean_loss= np.mean(np.array(val_loss_list))\n", 290 | "\n", 291 | " is_best= bool(mean_loss Saving a new lowest loss : \"+str(loss))\n", 134 | " torch.save(state, filename) # save checkpoint" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "This is the section where training takes place" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "total_step = len(train_data_loader)\n", 151 | "lowest_loss=0.800\n", 152 | "\n", 153 | "for epoch in range(num_epochs):\n", 154 | " for i, (images, labels) in enumerate(train_data_loader):\n", 155 | " images = images.to(device)\n", 156 | " labels = labels.to(device)\n", 157 | "# Forward pass\n", 158 
| " outputs = model(images)\n", 159 | " _, predicted = torch.max(outputs.data, 1)\n", 160 | " loss = criterion(outputs, labels)\n", 161 | " x=np.array((predicted==labels).cpu())\n", 162 | "# Backward and optimize\n", 163 | " optimizer.zero_grad() \n", 164 | " loss.backward()\n", 165 | " optimizer.step()\n", 166 | " val_loss_list=[]\n", 167 | " val_acc_list =[]\n", 168 | "# Doing validation \n", 169 | " for y,(images,labels) in enumerate(val_data_loader):\n", 170 | " images=images.to(device)\n", 171 | " labels=labels.to(device)\n", 172 | " outputs=model(images)\n", 173 | " _, predicted = torch.max(outputs.data, 1)\n", 174 | " val_loss = criterion(outputs, labels)\n", 175 | " val_loss_list.append(val_loss.item())\n", 176 | " val_x=np.array((predicted==labels).cpu())\n", 177 | " val_acc=(sum(val_x))*100/len(val_x)\n", 178 | " val_acc_list.append(val_acc)\n", 179 | " mean_loss= np.mean(np.array(val_loss_list))\n", 180 | " mean_acc = np.mean(np.array(val_acc_list))\n", 181 | " is_best= bool(mean_loss Saving a new lowest loss : \"+str(loss))\n", 240 | " torch.save(state, filename) # save checkpoint\n", 241 | "\n", 242 | "\n", 243 | "total_step = len(train_data_loader)\n", 244 | "lowest_loss=0.200\n", 245 | "checkpoint_step= int(total_step/8)\n", 246 | "\n", 247 | "\n", 248 | "for epoch in range(num_epochs):\n", 249 | "\n", 250 | " for i, (images,labels) in enumerate(train_data_loader):\n", 251 | " images = images.to(device)\n", 252 | " labels = labels.to(device)\n", 253 | " \n", 254 | " # Forward pass\n", 255 | " outputs = model(images)\n", 256 | "\n", 257 | " _, predicted = torch.max(outputs[:,0].data, 1)\n", 258 | " \n", 259 | " loss = criterion(outputs[:,0], labels)\n", 260 | " \n", 261 | " x=np.array((predicted==labels).cpu())\n", 262 | " # Backward and optimize\n", 263 | " optimizer.zero_grad() \n", 264 | " loss.backward()\n", 265 | " optimizer.step()\n", 266 | " val_loss_list=[]\n", 267 | " val_acc_list =[]\n", 268 | " \n", 269 | " for y,(images,labels) in enumerate(val_data_loader):\n", 270 | " images=images.to(device)\n", 271 | " labels=labels.to(device)\n", 272 | " outputs=model(images)\n", 273 | " _, predicted = torch.max(outputs[:,0].data, 1)\n", 274 | " val_loss = criterion(outputs[:,0], labels)\n", 275 | " val_loss_list.append(val_loss.item())\n", 276 | " \n", 277 | " val_x=np.array((predicted==labels).cpu())\n", 278 | " val_acc=(sum(val_x))*100/len(val_x)\n", 279 | " val_acc_list.append(val_acc)\n", 280 | " mean_loss= np.mean(np.array(val_loss_list))\n", 281 | " mean_acc = np.mean(np.array(val_acc_list))\n", 282 | " is_best= bool(mean_loss