├── ASM
    ├── ASM file to TEXT file (removing unwanted data).ipynb
    ├── CSV data based on frequency Count of Opcode.ipynb
    ├── Data Splitting.ipynb
    ├── Data Visualization (ASM).ipynb
    ├── Feature Selection.ipynb
    ├── Min-max normalization(opcodes).ipynb
    ├── Read me
    ├── Reducing opcode feature.ipynb
    ├── opc_list.npy
    └── sect_list.npy
├── Byte
    ├── Byte file to Image Conversion.ipynb
    ├── Data Visualization (Byte).ipynb
    ├── Feature extractor (CNN with pretrained encoder layers) - optimization.ipynb
    ├── Feature extractor (CNN-optimization).ipynb
    ├── Images to Comma-Separated Values(csv).ipynb
    ├── Min-max normalization(Byte).ipynb
    ├── Read me
    ├── Resizing images to 32x32.ipynb
    ├── Stratified sampling.ipynb
    ├── cnn_stacking.py
    ├── cnn_v1.py
    └── dataloader_csv.py
├── Hybrid (Final)
    ├── ANN-Results.ipynb
    ├── Creating hybrid dataset.ipynb
    ├── Data Visualization (Hybrid).ipynb
    ├── Min-max normalization (hybrid dataset).ipynb
    ├── Read me
    ├── ann_hybrid.py
    ├── dataloader_csv.py
    └── final_hybrid_csv.py
└── README.md


/ASM/ASM file to TEXT file (removing unwanted data).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# ASM file to TEXT file (removing unwanted data)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries and initilizing variables"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pyparsing import oneOf,Word,hexnums,Optional,WordEnd,alphas,alphanums\n",
 24 |     "import numpy as np\n",
 25 |     "\n",
 26 |     "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n",
 27 |     "\n",
 28 |     "data_path='...\\\\train\\\\'\n",
 29 |     "to_save=\"...\\\\asm_to_text\\\\\"\n",
 30 |     "\n",
 31 |     "section_list=[]\n",
 32 |     "opcode_list=[]\n",
 33 |     "files_not_parsed=[]"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "Reading ID's and labels from csv"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n",
 50 |     "malwarelist=malwarelist[1:]"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Making an ASM file line parser"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 6,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "hex_integer = Word(hexnums) + WordEnd()\n",
 67 |     "line = hex_integer + Optional((hex_integer*(1,))(\"instructions\") + Word(alphas,alphanums)(\"opcode\"))"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "This is the main part of the code, as it takes several days to process all files so it is designed in such a way that you can run and stop this section again and again without losing any data."
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 7,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "for name in malwarelist:\n",
 84 |     "    b = \".bytes\"\n",
 85 |     "    nam=name[0].strip('\"')\n",
 86 |     "    \n",
 87 |     "\n",
 88 |     "# =========Run following commands only if you are not runing this code for the first time =========\n",
 89 |     "\n",
 90 |     "#    c_files=np.load('c_files.npy')\n",
 91 |     "#    files_not_parsed=np.load('files_not_parsed.npy')\n",
 92 |     "#    c_files=c_files.tolist()\n",
 93 |     "#    files_not_parsed=files_not_parsed.tolist()\n",
 94 |     "#    section_list=np.load('sect_list.npy')\n",
 95 |     "#    opcode_list=np.load('opc_list.npy')\n",
 96 |     "#    section_list=section_list.tolist()\n",
 97 |     "#    opcode_list=opcode_list.tolist()\n",
 98 |     "# =================================================================================================\n",
 99 |     "\n",
100 |     "    \n",
101 |     "    if nam not in c_files:\n",
102 |     "        if nam not in files_not_parsed:\n",
103 |     "            nam=str(nam)\n",
104 |     "            nam_asm=data_path+nam+\".asm\"\n",
105 |     "            nam_txt=to_save+nam+\".txt\"\n",
106 |     "            asm_list=[]\n",
107 |     "            try:\n",
108 |     "                with open(nam_asm,encoding='ISO-8859-1') as f:\n",
109 |     "                    lines=f.readlines()\n",
110 |     "                    for source_line in lines:\n",
111 |     "                        if source_line==\"\\n\":\n",
112 |     "                            continue        \n",
113 |     "                        section=source_line[:source_line.find(':')+1]\n",
114 |     "                        if section not in section_list:\n",
115 |     "                            section_list.append(str(section))   \n",
116 |     "                        result = line.parseString(source_line[source_line.find(':')+1:])\n",
117 |     "                        if \"opcode\" in result:\n",
118 |     "                            opcod=result.opcode\n",
119 |     "                            x=section+' '+opcod\n",
120 |     "                            asm_list.append(x)\n",
121 |     "                            if opcod not in opcode_list:\n",
122 |     "                                opcode_list.append(str(result.opcode))\n",
123 |     "                with open(nam_txt, \"a\") as file_prime:\n",
124 |     "                    for i in asm_list:\n",
125 |     "                        file_prime.write(str(i)+ '\\n')\n",
126 |     "                        \n",
127 |     "                c_files.append(nam) \n",
128 |     "            except:\n",
129 |     "                files_not_parsed.append(nam)\n",
130 |     "                \n",
131 |     "    np.save('c_files.npy',c_files)\n",
132 |     "    np.save('files_not_parsed.npy',files_not_parsed)\n",
133 |     "    np.save('sect_list.npy', section_list)\n",
134 |     "    np.save('opc_list.npy', opcode_list)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "Note: We also parsed the files individually that are not parsed according to this scheme and update the sect_list and opc_list. You are given the updated sect_list and opc_list."
142 |    ]
143 |   }
144 |  ],
145 |  "metadata": {
146 |   "kernelspec": {
147 |    "display_name": "Python 3",
148 |    "language": "python",
149 |    "name": "python3"
150 |   },
151 |   "language_info": {
152 |    "codemirror_mode": {
153 |     "name": "ipython",
154 |     "version": 3
155 |    },
156 |    "file_extension": ".py",
157 |    "mimetype": "text/x-python",
158 |    "name": "python",
159 |    "nbconvert_exporter": "python",
160 |    "pygments_lexer": "ipython3",
161 |    "version": "3.7.2"
162 |   }
163 |  },
164 |  "nbformat": 4,
165 |  "nbformat_minor": 2
166 | }
167 | 


--------------------------------------------------------------------------------
/ASM/CSV data based on frequency Count of Opcode.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Opcode frequency Count based CSV data"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "The purpose of this code is to make CSV data that contain the frequency count of all opcode against each sample"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries and initializing varivables"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import numpy as np\n",
 31 |     "import string\n",
 32 |     "import csv\n",
 33 |     "\n",
 34 |     "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n",
 35 |     "path = '..//asm_to_text//'\n"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "Reading ID's and Labels from CSV"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n",
 52 |     "malwarelist=malwarelist[1:]"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "Making the first row that will be the field names in the CSV"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 3,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "opc=np.load('opc_list.npy')\n",
 69 |     "opc=opc.tolist()\n",
 70 |     "opc.append('ID')\n",
 71 |     "opc.append('LABEL')\n",
 72 |     "data_to_write=[]\n",
 73 |     "data_to_write.append(opc)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "This is the main section of the code that read text files and counts frequencies of different opcounts in each sample and stores them to the data_to_write list along with their ID's and labels "
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 4,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "for name in malwarelist: \n",
 90 |     "    \n",
 91 |     "    nam=name[0].strip('\"')\n",
 92 |     "    nam=str(nam)\n",
 93 |     "    count=count+1\n",
 94 |     "    print(nam,' ',count)\n",
 95 |     "    temp=[0]*len(opc)    \n",
 96 |     "    nam=path+nam+\".txt\"\n",
 97 |     "    if name[0].strip('\"') not in wrong: \n",
 98 |     "        with open(nam) as f:\n",
 99 |     "            lis=f.readlines()\n",
100 |     "            for source_lin in lis:\n",
101 |     "                if source_lin==\"\\n\":\n",
102 |     "                    continue    \n",
103 |     "                opco=source_lin[source_lin.find(':')+2:source_lin.find('\\\\')]         \n",
104 |     "                temp[opc.index(opco)]=temp[opc.index(opco)]+1\n",
105 |     "#=================In case of section use this=============================================\n",
106 |     "#                section=source_lin[:source_lin.find(':')+1]\n",
107 |     "#                section=section.translate({ord(c): None for c in string.whitespace})\n",
108 |     "#========================================================================================\n",
109 |     "            temp[opc.index('ID')]=name[0].strip('\"')\n",
110 |     "            temp[opc.index('LABEL')]=name[1]\n",
111 |     "    data_to_write.append(temp)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "In this section, CSV is generated"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 6,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "myFile = open('data_opcode.csv', 'w', newline='')  \n",
128 |     "with myFile:  \n",
129 |     "    writer = csv.writer(myFile)\n",
130 |     "    writer.writerows(data_to_write)"
131 |    ]
132 |   }
133 |  ],
134 |  "metadata": {
135 |   "kernelspec": {
136 |    "display_name": "Python 3",
137 |    "language": "python",
138 |    "name": "python3"
139 |   },
140 |   "language_info": {
141 |    "codemirror_mode": {
142 |     "name": "ipython",
143 |     "version": 3
144 |    },
145 |    "file_extension": ".py",
146 |    "mimetype": "text/x-python",
147 |    "name": "python",
148 |    "nbconvert_exporter": "python",
149 |    "pygments_lexer": "ipython3",
150 |    "version": "3.7.2"
151 |   }
152 |  },
153 |  "nbformat": 4,
154 |  "nbformat_minor": 2
155 | }
156 | 


--------------------------------------------------------------------------------
/ASM/Data Splitting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Splitting"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "In this code same splitting is done as with images data. Same samples are put in validation, test, train and train full CSV as in the case of images dataset."
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import numpy as np\n",
 24 |     "import csv"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "Initializing the variables"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "path_tr='data_to_tra.csv'\n",
 41 |     "path_futr='data_to_traFull.csv'\n",
 42 |     "path_te='data_to_test.csv'\n",
 43 |     "path_val='data_to_val.csv'\n",
 44 |     "path_opc='500_reduce_opcode.csv'"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "Reading the data from CSV"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 3,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "original=np.genfromtxt(path_opc,delimiter=\",\",dtype=str)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "Making training data CSV"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 4,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "temp=np.genfromtxt(path_tr,delimiter=\",\",dtype=str)\n",
 77 |     "\n",
 78 |     "to_csv=[]\n",
 79 |     "\n",
 80 |     "to_csv.append(original[0].tolist())\n",
 81 |     "\n",
 82 |     "for i in original:\n",
 83 |     "    if i[-2] in temp[:,-2]:\n",
 84 |     "        to_csv.append(i.tolist())\n",
 85 |     "\n",
 86 |     "myFile = open('n_500_data_to_tra.csv', 'w', newline='')  \n",
 87 |     "\n",
 88 |     "with myFile:  \n",
 89 |     "    writer = csv.writer(myFile)\n",
 90 |     "    writer.writerows(to_csv)"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "metadata": {},
 96 |    "source": [
 97 |     "Making train full data CSV"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 5,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "temp=np.genfromtxt(path_futr,delimiter=\",\",dtype=str)\n",
107 |     "\n",
108 |     "to_csv=[]\n",
109 |     "\n",
110 |     "to_csv.append(original[0].tolist())\n",
111 |     "\n",
112 |     "for i in original:\n",
113 |     "    if i[-2] in temp[:,-2]:\n",
114 |     "        to_csv.append(i.tolist())\n",
115 |     "\n",
116 |     "myFile = open('n_500_data_to_traFull.csv', 'w', newline='')  \n",
117 |     "\n",
118 |     "with myFile:  \n",
119 |     "    writer = csv.writer(myFile)\n",
120 |     "    writer.writerows(to_csv)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Making test CSV"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 6,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "temp=np.genfromtxt(path_te,delimiter=\",\",dtype=str)\n",
137 |     "\n",
138 |     "to_csv=[]\n",
139 |     "\n",
140 |     "to_csv.append(original[0].tolist())\n",
141 |     "\n",
142 |     "for i in original:\n",
143 |     "    if i[-2] in temp[:,-2]:\n",
144 |     "        to_csv.append(i.tolist())\n",
145 |     "\n",
146 |     "myFile = open('n_500_data_to_test.csv', 'w', newline='')  \n",
147 |     "\n",
148 |     "with myFile:  \n",
149 |     "    writer = csv.writer(myFile)\n",
150 |     "    writer.writerows(to_csv)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "Making validation CSV"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 7,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "temp=np.genfromtxt(path_val,delimiter=\",\",dtype=str)\n",
167 |     "\n",
168 |     "to_csv=[]\n",
169 |     "\n",
170 |     "to_csv.append(original[0].tolist())\n",
171 |     "\n",
172 |     "for i in original:\n",
173 |     "    if i[-2] in temp[:,-2]:\n",
174 |     "        to_csv.append(i.tolist())\n",
175 |     "\n",
176 |     "myFile = open('n_500_data_to_val.csv', 'w', newline='')  \n",
177 |     "\n",
178 |     "with myFile:  \n",
179 |     "    writer = csv.writer(myFile)\n",
180 |     "    writer.writerows(to_csv)"
181 |    ]
182 |   }
183 |  ],
184 |  "metadata": {
185 |   "kernelspec": {
186 |    "display_name": "Python 3",
187 |    "language": "python",
188 |    "name": "python3"
189 |   },
190 |   "language_info": {
191 |    "codemirror_mode": {
192 |     "name": "ipython",
193 |     "version": 3
194 |    },
195 |    "file_extension": ".py",
196 |    "mimetype": "text/x-python",
197 |    "name": "python",
198 |    "nbconvert_exporter": "python",
199 |    "pygments_lexer": "ipython3",
200 |    "version": "3.7.2"
201 |   }
202 |  },
203 |  "nbformat": 4,
204 |  "nbformat_minor": 2
205 | }
206 | 


--------------------------------------------------------------------------------
/ASM/Feature Selection.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Feature Selection"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "The following code is used for feature selection using wrapper based feature selection mechanism where SVM with rbf kernel is used as a classifier. Feature selection and backward elimination both techquies are used. top_feature variable is used for selecting the different number of opcodes. It finds out that top 116 features help better in classification. Following is the code where we get features.  "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries and initializing variables"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import numpy as np\n",
 31 |     "import matplotlib.pyplot as plt\n",
 32 |     "from sklearn import svm\n",
 33 |     "from sklearn.metrics import confusion_matrix\n",
 34 |     "from sklearn.preprocessing import label_binarize\n",
 35 |     "from sklearn.multiclass import OneVsRestClassifier\n",
 36 |     "path_tr='nn_500_data_to_tra.csv'\n",
 37 |     "path_futr='nn_500_data_to_traFull.csv'\n",
 38 |     "path_te='nn_500_data_to_test.csv'\n",
 39 |     "path_val='nn_500_data_to_val.csv'\n",
 40 |     "top_feature=116"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "Reading data"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 2,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "train = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n",
 57 |     "val = np.genfromtxt(path_val, delimiter=\",\", dtype=str)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Data processing"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 3,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "x_data  = train[1:,0:top_feature]\n",
 74 |     "x_label = train[1:,-1]\n",
 75 |     "x_data=x_data.astype(np.float)\n",
 76 |     "x_label=x_label.astype(np.int)\n",
 77 |     "# Binarize the output\n",
 78 |     "x_label = label_binarize(x_label, classes=[1,2,3,4,5,6,7,8,9])\n",
 79 |     "\n",
 80 |     "y_data  = val[1:,0:top_feature]\n",
 81 |     "y_data=y_data.astype(np.float)\n",
 82 |     "y_label = val[1:,-1]\n",
 83 |     "y_label=y_label.astype(np.int)\n",
 84 |     "woby=y_label\n",
 85 |     "# Binarize the output\n",
 86 |     "y_label = label_binarize(y_label, classes=[1,2,3,4,5,6,7,8,9])"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "Applying custom grid search. optimize c and g values store in opt_c,opt_g respectively"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 4,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "c=[0.01,0.1,1,10,100]\n",
103 |     "g=[0.01,0.1,1,10,100]\n",
104 |     "loss=100\n",
105 |     "opt_c=0\n",
106 |     "opt_g=0\n",
107 |     "for i in c:\n",
108 |     "    for j in g:    \n",
109 |     "        clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=i,gamma=j, probability=True))\n",
110 |     "        clf_f = clf.fit(x_data, x_label)\n",
111 |     "        y_score=clf_f.decision_function(y_data)\n",
112 |     "        y_predict_prob=clf_f.predict_proba(y_data)\n",
113 |     "        y_predict=clf_f.predict(y_data)\n",
114 |     "        pre=y_predict.argmax(1)\n",
115 |     "        pre=pre+1\n",
116 |     "        a=log_loss(y_label,y_predict_prob)\n",
117 |     "        if a<loss:\n",
118 |     "            opt_c=i\n",
119 |     "            opt_g=j\n",
120 |     "            loss=a\n",
121 |     "        print(confusion_matrix(woby,pre))\n",
122 |     "        print('log loss: ',a,'  C: ',i,'  G: ',j)"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "Reading data from train full data and test data"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 5,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "train = np.genfromtxt(path_futr, delimiter=\",\", dtype=str)\n",
139 |     "test = np.genfromtxt(path_te, delimiter=\",\", dtype=str)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "Data processing"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 6,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "x_data  = train[1:,0:top_feature]\n",
156 |     "x_label = train[1:,-1]\n",
157 |     "x_data=x_data.astype(np.float)\n",
158 |     "x_label=x_label.astype(np.int)\n",
159 |     "# Binarize the output\n",
160 |     "x_label = label_binarize(x_label, classes=[1,2,3,4,5,6,7,8,9])\n",
161 |     "\n",
162 |     "\n",
163 |     "y_data  = test[1:,0:top_feature]\n",
164 |     "y_data=y_data.astype(np.float)\n",
165 |     "\n",
166 |     "y_label = test[1:,-1]\n",
167 |     "y_label=y_label.astype(np.int)\n",
168 |     "woby=y_label\n",
169 |     "# Binarize the output\n",
170 |     "y_label = label_binarize(y_label, classes=[1,2,3,4,5,6,7,8,9])"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "markdown",
175 |    "metadata": {},
176 |    "source": [
177 |     "Training the classifier(SVM) on optimized parameters"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 7,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=opt_c,gamma=opt_g, probability=True))\n",
187 |     "clf_f = clf.fit(x_data, x_label)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {},
193 |    "source": [
194 |     "Calculating the log_loss and confusion matrix"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 8,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "y_score=clf_f.decision_function(y_data)\n",
204 |     "y_predict_prob=clf_f.predict_proba(y_data)\n",
205 |     "y_predict=clf_f.predict(y_data)\n",
206 |     "pre=y_predict.argmax(1)\n",
207 |     "pre=pre+1\n",
208 |     "\n",
209 |     "a=log_loss(y_label,y_predict_prob)\n",
210 |     "\n",
211 |     "print(confusion_matrix(woby,pre)) \n",
212 |     "\n",
213 |     "print('log loss: ',a,'  C: ',opt_c,'  G: ',opt_g)"
214 |    ]
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "kernelspec": {
219 |    "display_name": "Python 3",
220 |    "language": "python",
221 |    "name": "python3"
222 |   },
223 |   "language_info": {
224 |    "codemirror_mode": {
225 |     "name": "ipython",
226 |     "version": 3
227 |    },
228 |    "file_extension": ".py",
229 |    "mimetype": "text/x-python",
230 |    "name": "python",
231 |    "nbconvert_exporter": "python",
232 |    "pygments_lexer": "ipython3",
233 |    "version": "3.7.2"
234 |   }
235 |  },
236 |  "nbformat": 4,
237 |  "nbformat_minor": 2
238 | }
239 | 


--------------------------------------------------------------------------------
/ASM/Min-max normalization(opcodes).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Min-max normalization(opcodes)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries and initializing variables"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import numpy as np\n",
 24 |     "import csv\n",
 25 |     "path_tr='n_500_data_to_tra.csv'\n",
 26 |     "path_futr='n_500_data_to_traFull.csv'\n",
 27 |     "path_te='n_500_data_to_test.csv'\n",
 28 |     "path_val='n_500_data_to_val.csv'"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "In this section of code train data is normalize with respect to its max and min values against each feature"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "new_train_data = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n",
 45 |     "tra_data= new_train_data[1:,0:-2]\n",
 46 |     "tra_lab=new_train_data[1:,-1]\n",
 47 |     "tra_name=new_train_data[1:,-2]\n",
 48 |     "tra_lab=tra_lab.astype(np.int)\n",
 49 |     "tra_data=tra_data.astype(np.float)\n",
 50 |     "tra_min=np.min(tra_data,axis=0)\n",
 51 |     "tra_max=np.max(tra_data,axis=0)\n",
 52 |     "tra_dom=np.subtract(tra_max,tra_min)\n",
 53 |     "tra_dom=np.where(tra_dom == 0, 1, tra_dom)\n",
 54 |     "img = np.subtract(tra_data,tra_min)\n",
 55 |     "img = np.divide(img, tra_dom)\n",
 56 |     "img= img.tolist()\n",
 57 |     "\n",
 58 |     "for i in range(len(tra_lab)):\n",
 59 |     "    img[i].append(tra_name[i])\n",
 60 |     "    img[i].append(tra_lab[i])\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "myFile = open('nn_500_data_to_tra.csv', 'w', newline='')  \n",
 64 |     "\n",
 65 |     "with myFile:  \n",
 66 |     "    writer = csv.writer(myFile)\n",
 67 |     "    writer.writerows(img)\n"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "Validation data is normalized with respect to train data max and min values against each feature"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 3,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "new_valid_data = np.genfromtxt(path_val, delimiter=\",\", dtype=str)\n",
 84 |     "valid_data= new_valid_data[1:,0:-2]\n",
 85 |     "valid_name=new_valid_data[1:,-2]\n",
 86 |     "valid_lab=new_valid_data[1:,-1]\n",
 87 |     "valid_lab=valid_lab.astype(np.int)\n",
 88 |     "valid_data=valid_data.astype(np.float)\n",
 89 |     "img = np.subtract(valid_data,tra_min)\n",
 90 |     "img = np.divide(img, tra_dom)\n",
 91 |     "img= img.tolist()\n",
 92 |     "\n",
 93 |     "for i in range(len(valid_lab)):\n",
 94 |     "    img[i].append(valid_name[i])\n",
 95 |     "    img[i].append(valid_lab[i])\n",
 96 |     "\n",
 97 |     "\n",
 98 |     "myFile = open('nn_500_data_to_val.csv', 'w', newline='')  \n",
 99 |     "\n",
100 |     "with myFile:  \n",
101 |     "    writer = csv.writer(myFile)\n",
102 |     "    writer.writerows(img)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "In this section of code trainfull data is normalized with respect to its max and min values against each feature"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 4,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "new_x_data = np.genfromtxt(path_futr, delimiter=\",\", dtype=str)\n",
119 |     "x_data= new_x_data[1:,0:-2]\n",
120 |     "x_name=new_x_data[1:,-2]\n",
121 |     "x_lab=new_x_data[1:,-1]\n",
122 |     "x_lab=x_lab.astype(np.int)\n",
123 |     "x_data=x_data.astype(np.float)\n",
124 |     "x_min=np.min(x_data,axis=0)\n",
125 |     "x_max=np.max(x_data,axis=0)\n",
126 |     "x_dom=np.subtract(x_max,x_min)\n",
127 |     "x_dom=np.where(x_dom == 0, 1, x_dom)\n",
128 |     "img = np.subtract(x_data,x_min)\n",
129 |     "img = np.divide(img, x_dom)\n",
130 |     "img= img.tolist()\n",
131 |     "\n",
132 |     "for i in range(len(x_lab)):\n",
133 |     "    img[i].append(x_name[i])\n",
134 |     "    img[i].append(x_lab[i])\n",
135 |     "\n",
136 |     "\n",
137 |     "myFile = open('nn_500_data_to_traFull.csv', 'w', newline='')  \n",
138 |     "\n",
139 |     "with myFile:  \n",
140 |     "    writer = csv.writer(myFile)\n",
141 |     "    writer.writerows(img)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Test data is normalized with respect to trainfull data max and min values against each feature"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 5,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "new_test_data = np.genfromtxt(path_te, delimiter=\",\", dtype=str)\n",
158 |     "test_data= new_test_data[1:,0:-2]\n",
159 |     "test_lab=new_test_data[1:,-1]\n",
160 |     "test_name=new_test_data[1:,-2]\n",
161 |     "test_lab=test_lab.astype(np.int)\n",
162 |     "test_data=test_data.astype(np.float)\n",
163 |     "img = np.subtract(test_data,x_min)\n",
164 |     "img = np.divide(img, x_dom)\n",
165 |     "img= img.tolist()\n",
166 |     "\n",
167 |     "for i in range(len(test_lab)):\n",
168 |     "    img[i].append(test_name[i])\n",
169 |     "    img[i].append(test_lab[i])\n",
170 |     "\n",
171 |     "\n",
172 |     "myFile = open('nn_500_data_to_test.csv', 'w', newline='')  \n",
173 |     "\n",
174 |     "with myFile:  \n",
175 |     "    writer = csv.writer(myFile)\n",
176 |     "    writer.writerows(img)\n"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.7.2"
197 |   }
198 |  },
199 |  "nbformat": 4,
200 |  "nbformat_minor": 2
201 | }
202 | 


--------------------------------------------------------------------------------
/ASM/Read me:
--------------------------------------------------------------------------------
 1 | The squence of these tutorial is given below:
 2 | 
 3 | 1. ASM file to TEXT file (removing unwanted data)
 4 | 
 5 | 2. CSV data based on frequency Count of Opcode
 6 | 
 7 | 3. Reducing opcode feature 
 8 | 
 9 | 4. Data Splitting
10 | 
11 | 5. Min-max normalization
12 | 
13 | 6. Feature Selection
14 | 
15 | The preprocessed data of ASM files plus the data use in these tutorial can be found on the ASM folder of following drive link:
16 | 
17 | https://drive.google.com/drive/folders/1s7EC4s_-hP9q5vEhs-3vAubspcZbBADK?usp=sharing
18 | 


--------------------------------------------------------------------------------
/ASM/Reducing opcode feature.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Reducing opcode Features"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "In this part, we reduce the number of opcodes whose overall frequencies are less than 500 for reducing the size of CSV"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import numpy as np\n",
 31 |     "import pandas as pd\n",
 32 |     "import csv"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "Reading values from data_opcode. As its size is too large, it is read in the chunk of 1000 and concatenate at the end"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 2,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "size=1000\n",
 49 |     "b=pd.read_csv('data_opcode.csv',chunksize=size)\n",
 50 |     "data_list=[]\n",
 51 |     "for i in b:\n",
 52 |     "    data_list.append(i)\n",
 53 |     "data_pd = pd.DataFrame()\n",
 54 |     "for chunk in data_list:\n",
 55 |     "    data_pd = pd.concat([data_pd,chunk])\n",
 56 |     "data_arr=data_pd.values"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "In this section indexes of opcode fields extracted whose overall frequencies are less than 500"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "x=data_arr[:,0:-2]\n",
 73 |     "x=np.sum(x,axis=0)\n",
 74 |     "index_list=[]\n",
 75 |     "opcode_freq=500\n",
 76 |     "for i,j in enumerate(x):\n",
 77 |     "    if j>=opcode_freq:\n",
 78 |     "        index_list.append(i)"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "In this section row for field name of CSV is created by eliminating the less frequent opcodes"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 4,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "data_to_list=[]\n",
 95 |     "names=[]\n",
 96 |     "for i,j in enumerate(data_pd):\n",
 97 |     "    if i in index_list:\n",
 98 |     "        names.append(j)\n",
 99 |     "names.append('ID')\n",
100 |     "names.append('LABELS')\n",
101 |     "data_to_list.append(names)"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "Here data(selected opcodes) of samples copied to data_to_list along with their names and labels"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 5,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "for i in data_arr:\n",
118 |     "    temp=[]\n",
119 |     "    for j in index_list:\n",
120 |     "        temp.append(i[j])\n",
121 |     "            \n",
122 |     "    temp.append(i[-2])\n",
123 |     "    temp.append(i[-1])\n",
124 |     "    data_to_list.append(temp)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "Saving new data to CSV"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 6,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "myFile = open('500_reduce_opcode.csv', 'w', newline='')  \n",
141 |     "with myFile:  \n",
142 |     "    writer = csv.writer(myFile)\n",
143 |     "    writer.writerows(data_to_list)"
144 |    ]
145 |   }
146 |  ],
147 |  "metadata": {
148 |   "kernelspec": {
149 |    "display_name": "Python 3",
150 |    "language": "python",
151 |    "name": "python3"
152 |   },
153 |   "language_info": {
154 |    "codemirror_mode": {
155 |     "name": "ipython",
156 |     "version": 3
157 |    },
158 |    "file_extension": ".py",
159 |    "mimetype": "text/x-python",
160 |    "name": "python",
161 |    "nbconvert_exporter": "python",
162 |    "pygments_lexer": "ipython3",
163 |    "version": "3.7.2"
164 |   }
165 |  },
166 |  "nbformat": 4,
167 |  "nbformat_minor": 2
168 | }
169 | 


--------------------------------------------------------------------------------
/ASM/opc_list.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyberhunters/Malware-Detection-Using-Machine-Learning/d07b19ad409582493e7042b83cbb92a0e3906197/ASM/opc_list.npy


--------------------------------------------------------------------------------
/ASM/sect_list.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyberhunters/Malware-Detection-Using-Machine-Learning/d07b19ad409582493e7042b83cbb92a0e3906197/ASM/sect_list.npy


--------------------------------------------------------------------------------
/Byte/Byte file to Image Conversion.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Byte Files to Image Conversion"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from PIL import Image\n",
 24 |     "import numpy"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "Fixing the width of images. As each byte file size varies so, we have to fix either its width or height for better visualization "
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "width = 1366"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "Reading the data (names and labels) from csv"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 4,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n",
 57 |     "malwarelist=malwarelist[1:]"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Giving a path where to store images and from where to pick byte files "
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 5,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "destination='...\\\\images...\\\\'\n",
 74 |     "folder='...\\\\byte files....\\\\'"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "The following piece of code does the actual work. It opens each byte file, then read each record(row) and converts hexadecimal to decimal with act as a pixel value of the image. Image is formed using Pillow library by keeping fixed width As some of the byte files are corrupted, so a check is made to store these files name for removing them from actual data."
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 6,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "corrupted_files=[]\n",
 91 |     "for name in malwarelist:\n",
 92 |     "    b = \".bytes\"\n",
 93 |     "    nam=name[0].strip('\"')\n",
 94 |     "    loc = folder+nam + b\n",
 95 |     "    hexar = []\n",
 96 |     "    with open(loc, 'rb') as f: \n",
 97 |     "        for line in f:\n",
 98 |     "            hexar.extend(int(el, 16) for el in line.split()[1:] if el != b'??')\n",
 99 |     "    print(len(hexar)) \n",
100 |     "    if len(hexar)!=0:\n",
101 |     "        rn = len(hexar) // width\n",
102 |     "        fh = numpy.reshape(hexar[:rn * width], (-1, width))\n",
103 |     "        fh=  np.uint8(fh)\n",
104 |     "        print(nam,'  ',fh.shape)\n",
105 |     "        img = Image.fromarray(fh)\n",
106 |     "        img.save(destination+nam+\".png\")\n",
107 |     "    if len(hexar)==0:\n",
108 |     "        corrupted_files.append(nam)"
109 |    ]
110 |   }
111 |  ],
112 |  "metadata": {
113 |   "kernelspec": {
114 |    "display_name": "Python 3",
115 |    "language": "python",
116 |    "name": "python3"
117 |   },
118 |   "language_info": {
119 |    "codemirror_mode": {
120 |     "name": "ipython",
121 |     "version": 3
122 |    },
123 |    "file_extension": ".py",
124 |    "mimetype": "text/x-python",
125 |    "name": "python",
126 |    "nbconvert_exporter": "python",
127 |    "pygments_lexer": "ipython3",
128 |    "version": "3.7.2"
129 |   }
130 |  },
131 |  "nbformat": 4,
132 |  "nbformat_minor": 2
133 | }
134 | 


--------------------------------------------------------------------------------
/Byte/Feature extractor (CNN with pretrained encoder layers) - optimization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Feature extractor (CNN with pretrained encoder layers) - optimization"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import torch\n",
 24 |     "import torch.nn as nn\n",
 25 |     "import torch.utils.data as data\n",
 26 |     "import numpy as np\n",
 27 |     "from cnn_stacking import autoencoder,autoencoder1\n",
 28 |     "from cnn_stacking import Cnn_Stacking\n",
 29 |     "from dataloader_csv import CustomDatasetFromImages"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "Checking the availability of GPU "
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
 46 |     "if torch.cuda.is_available(): \n",
 47 |     "    print(\"gpu available\")"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "metadata": {},
 53 |    "source": [
 54 |     "This function is used to save the state of the model"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 3,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "path_to_checkpoint='...//save_state_cnn_stack//'\n",
 64 |     "check_name='checkpoint.pth.tar'\n",
 65 |     "def save_checkpoint(state, is_best, filename,loss):\n",
 66 |     "    \"\"\"Save checkpoint if a new best is achieved\"\"\"\n",
 67 |     "    if is_best:\n",
 68 |     "        print (\"=> Saving a new lowest loss : \"+str(loss))\n",
 69 |     "        torch.save(state, filename)  # save checkpoint"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "Following function takes 5 parameters. Purpose of this function is to train the encoder layer of autoencoder"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 4,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "def single_auto(model,_data,num_epochs,learning_rate,w_decay):\n",
 86 |     "    \n",
 87 |     "    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
 88 |     "    criterion = nn.MSELoss()\n",
 89 |     "\n",
 90 |     "    for epoch in range(num_epochs):\n",
 91 |     "        loss_list=[]\n",
 92 |     "        for images, labels in _data:\n",
 93 |     "\n",
 94 |     "            images = images.to(device)\n",
 95 |     "\n",
 96 |     "            outputs = model(images)\n",
 97 |     "\n",
 98 |     "            loss = criterion(outputs,images)\n",
 99 |     "            optimizer.zero_grad() \n",
100 |     "            \n",
101 |     "            l1_reg = None\n",
102 |     "            for W in model.parameters():\n",
103 |     "                if l1_reg is None:\n",
104 |     "                    l1_reg = W.norm(p=1)\n",
105 |     "                else:\n",
106 |     "                    l1_reg = l1_reg + W.norm(p=1)\n",
107 |     "                                     \n",
108 |     "            loss=loss+l1_reg * w_decay\n",
109 |     "            \n",
110 |     "            loss.backward(retain_graph=True)\n",
111 |     "            \n",
112 |     "            optimizer.step()\n",
113 |     "            loss_list.append(loss.item())\n",
114 |     "        print('epoch [{}/{}], mean_loss:{:.4f}'.format(epoch + 1, num_epochs,np.mean(np.array(loss_list))))\n",
115 |     "    return model"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "The following function takes 6 parameters. This function first pass the data from pretrained encoder layer of an autoencoder and then pass it through the autoencoder1 for the pretraining of its encoder layer"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 5,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "def auto_to_auto(model_e,model_t,_data,num_epochs,learning_rate,w_decay):\n",
132 |     "    optimizer = torch.optim.Adam(model_t.parameters(), lr=learning_rate)\n",
133 |     "    criterion = nn.MSELoss()\n",
134 |     "    for epoch in range(num_epochs):\n",
135 |     "        loss_list=[]\n",
136 |     "        for images, labels in _data:\n",
137 |     "          \n",
138 |     "            images = images.to(device)\n",
139 |     "            \n",
140 |     "            images=model_e.encoder(images)\n",
141 |     "         \n",
142 |     "            outputs = model_t(images)\n",
143 |     "\n",
144 |     "            loss = criterion(outputs,images)\n",
145 |     "            \n",
146 |     "            optimizer.zero_grad() \n",
147 |     "          \n",
148 |     "            \n",
149 |     "            l1_reg = None\n",
150 |     "            for W in model_t.parameters():\n",
151 |     "                if l1_reg is None:\n",
152 |     "                    l1_reg = W.norm(p=1)\n",
153 |     "                else:\n",
154 |     "                    l1_reg = l1_reg + W.norm(p=1)\n",
155 |     "                                     \n",
156 |     "            loss=loss+l1_reg * w_decay\n",
157 |     "            \n",
158 |     "            loss.backward(retain_graph=True)\n",
159 |     "            \n",
160 |     "            optimizer.step()\n",
161 |     "            loss_list.append(loss.item())\n",
162 |     "        print('epoch [{}/{}], mean_loss:{:.4f}'\n",
163 |     "              .format(epoch + 1, num_epochs,np.mean(np.array(loss_list))\n",
164 |     "        ))\n",
165 |     "    return model_t"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "metadata": {},
171 |    "source": [
172 |     "Loading the train and validation data"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 6,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "TRAIN_DATA_PATH = \"...//data_to_tra.csv\"\n",
182 |     "VAL_DATA_PATH=\"...//data_to_val.csv\"\n",
183 |     "\n",
184 |     "\n",
185 |     "train_data=CustomDatasetFromImages(TRAIN_DATA_PATH)\n",
186 |     "train_data_loader = data.DataLoader(train_data,batch_size=20,shuffle=True)\n",
187 |     "\n",
188 |     "\n",
189 |     "val_data=CustomDatasetFromImages(VAL_DATA_PATH)\n",
190 |     "val_data_loader = data.DataLoader(val_data,batch_size=20,shuffle=True)\n"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "Setting the parameters"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": 7,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "num_epochs,learning_rate,weight_decay = 100,0.001,1e-5"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "Trainig the autoencoders"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 8,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "auto=single_auto(autoencoder().to(device).float(),train_data_loader,num_epochs,learning_rate,weight_decay)\n",
223 |     "auto1=auto_to_auto(auto,autoencoder1().to(device).float(),train_data_loader,num_epochs,learning_rate,weight_decay)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "Creating the model (stacking the pretrained encoder layer for fine tuning purpose ), defining the loss function and optimizers "
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 10,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "model=Cnn_Stacking(auto,auto1).to(device).float()\n",
240 |     "optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-3)\n",
241 |     "criterion = nn.CrossEntropyLoss()"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "markdown",
246 |    "metadata": {},
247 |    "source": [
248 |     "In this section, stacked encoder layers are fine tunned"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 11,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "total_step = len(train_data_loader)\n",
258 |     "lowest_loss=0.300\n",
259 |     "\n",
260 |     "for epoch in range(num_epochs):\n",
261 |     "\n",
262 |     "    for i, (images, labels) in enumerate(train_data_loader):\n",
263 |     "        \n",
264 |     "        images = images.to(device)\n",
265 |     "        labels = labels\n",
266 |     "        labels = labels.to(device)\n",
267 |     "\n",
268 |     "        outputs = model(images)\n",
269 |     "\n",
270 |     "        loss = criterion(outputs,labels)\n",
271 |     "\n",
272 |     "        optimizer.zero_grad() \n",
273 |     "        loss.backward()\n",
274 |     "        optimizer.step()\n",
275 |     "\n",
276 |     "        val_loss_list=[]\n",
277 |     "        for y,(val_images,val_labels) in enumerate(val_data_loader):\n",
278 |     "\n",
279 |     "            val_images=val_images.to(device)\n",
280 |     "\n",
281 |     "            val_labels=val_labels\n",
282 |     "            val_labels=val_labels.to(device)\n",
283 |     "\n",
284 |     "            val_outputs=model(val_images)\n",
285 |     "\n",
286 |     "            val_loss = criterion(val_outputs, val_labels)\n",
287 |     "            val_loss_list.append(val_loss.item())\n",
288 |     "        \n",
289 |     "        mean_loss= np.mean(np.array(val_loss_list))\n",
290 |     "\n",
291 |     "        is_best= bool(mean_loss<lowest_loss)\n",
292 |     "        if(is_best):\n",
293 |     "            lowest_loss= min(mean_loss,lowest_loss)\n",
294 |     "            path= path_to_checkpoint+str(lowest_loss)+\" \"+str(epoch+1)+\" \"+check_name\n",
295 |     "            save_checkpoint({'epoch':epoch + 1,'state_dict': model.state_dict(),'lowest_loss': lowest_loss }, is_best,path,lowest_loss)\n",
296 |     "            is_best= False\n",
297 |     "        \n",
298 |     "        print(\"Epoch [\"+str(epoch+1)+\"/\"+str(num_epochs)+\"],Batch_no[\"+str(i+1)+\"/\"+str(total_step)+\"] \"+\"Loss:\"+str(format(loss.item(),'.4f'))+\" Val_loss:\"+str(format(mean_loss,'.4f')))"
299 |    ]
300 |   }
301 |  ],
302 |  "metadata": {
303 |   "kernelspec": {
304 |    "display_name": "Python 3",
305 |    "language": "python",
306 |    "name": "python3"
307 |   },
308 |   "language_info": {
309 |    "codemirror_mode": {
310 |     "name": "ipython",
311 |     "version": 3
312 |    },
313 |    "file_extension": ".py",
314 |    "mimetype": "text/x-python",
315 |    "name": "python",
316 |    "nbconvert_exporter": "python",
317 |    "pygments_lexer": "ipython3",
318 |    "version": "3.7.2"
319 |   }
320 |  },
321 |  "nbformat": 4,
322 |  "nbformat_minor": 2
323 | }
324 | 


--------------------------------------------------------------------------------
/Byte/Feature extractor (CNN-optimization).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Feature extractor (CNN) - optimization"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "In this code, we optimize the feature extractor (CNN). Following mentioned parameters are optimized one"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import torch\n",
 31 |     "import torch.nn as nn\n",
 32 |     "import torch.utils.data as data\n",
 33 |     "import numpy as np\n",
 34 |     "from cnn_v1 import ConvNet_single\n",
 35 |     "from dataloader_csv import CustomDatasetFromImages"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "Checking the availability of GPU"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
 52 |     "if torch.cuda.is_available(): \n",
 53 |     "    print(\"gpu availible\")"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "Setting the parameters"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 3,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "num_epochs,num_classes,batch_size,learning_rate = 30, 9,20,0.001"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "Loading the train and validation data"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 5,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "train_path='data_to_tra.csv'\n",
 86 |     "val_path= 'data_to_val.csv'\n",
 87 |     "\n",
 88 |     "train_data = CustomDatasetFromImages(train_path)\n",
 89 |     "val_data = CustomDatasetFromImages(val_path)\n",
 90 |     "\n",
 91 |     "val_size = 500\n",
 92 |     "\n",
 93 |     "train_data_loader = data.DataLoader(train_data, batch_size=batch_size,shuffle=True) \n",
 94 |     "val_data_loader  = data.DataLoader(val_data, batch_size=val_size, shuffle=True)"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "Creating the model, defining loss function and optimizer"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 6,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "model= ConvNet_single().to(device)\n",
111 |     "criterion = nn.CrossEntropyLoss()\n",
112 |     "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "Function to save the state of the model"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 7,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "path_to_checkpoint='.../check_point/'\n",
129 |     "check_name='checkpoint.pth.tar'\n",
130 |     "\n",
131 |     "def save_checkpoint(state, is_best, filename,loss):\n",
132 |     "    if is_best:\n",
133 |     "        print (\"=> Saving a new lowest loss : \"+str(loss))\n",
134 |     "        torch.save(state, filename)  # save checkpoint"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "This is the section where training takes place"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 8,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "total_step = len(train_data_loader)\n",
151 |     "lowest_loss=0.800\n",
152 |     "\n",
153 |     "for epoch in range(num_epochs):\n",
154 |     "    for i, (images, labels) in enumerate(train_data_loader):\n",
155 |     "        images = images.to(device)\n",
156 |     "        labels = labels.to(device)\n",
157 |     "# Forward pass\n",
158 |     "        outputs = model(images)\n",
159 |     "        _, predicted = torch.max(outputs.data, 1)\n",
160 |     "        loss = criterion(outputs, labels)\n",
161 |     "        x=np.array((predicted==labels).cpu())\n",
162 |     "# Backward and optimize\n",
163 |     "        optimizer.zero_grad() \n",
164 |     "        loss.backward()\n",
165 |     "        optimizer.step()\n",
166 |     "        val_loss_list=[]\n",
167 |     "        val_acc_list =[]\n",
168 |     "# Doing validation  \n",
169 |     "        for y,(images,labels) in enumerate(val_data_loader):\n",
170 |     "            images=images.to(device)\n",
171 |     "            labels=labels.to(device)\n",
172 |     "            outputs=model(images)\n",
173 |     "            _, predicted = torch.max(outputs.data, 1)\n",
174 |     "            val_loss = criterion(outputs, labels)\n",
175 |     "            val_loss_list.append(val_loss.item())\n",
176 |     "            val_x=np.array((predicted==labels).cpu())\n",
177 |     "            val_acc=(sum(val_x))*100/len(val_x)\n",
178 |     "            val_acc_list.append(val_acc)\n",
179 |     "        mean_loss= np.mean(np.array(val_loss_list))\n",
180 |     "        mean_acc = np.mean(np.array(val_acc_list))\n",
181 |     "        is_best= bool(mean_loss<lowest_loss)\n",
182 |     "# If validation results are good then previous loss new state of the model will be save\n",
183 |     "        if(is_best):\n",
184 |     "            lowest_loss= min(mean_loss,lowest_loss)\n",
185 |     "            pathe= path_to_checkpoint+str(lowest_loss)+\" \"+str(epoch+1)+\" \"+check_name\n",
186 |     "            save_checkpoint({'epoch':epoch + 1,'state_dict': model.state_dict(),'lowest_loss': lowest_loss }, is_best,pathe,lowest_loss)\n",
187 |     "            is_best= False\n",
188 |     "\n",
189 |     "        print(\"Epoch [\"+str(epoch+1)+\"/\"+str(num_epochs)+\"],Batch_no[\"+str(i+1)+\"/\"+str(total_step)+\"] \"+\"Loss:\"+str(format(loss.item(),'.4f'))+\" Acc :\"+str(format((sum(x))*100/len(x),'.4f'))+\" Val_loss:\"+str(format(mean_loss,'.4f'))+\" Val_acc:\"+str(format(mean_acc,'.0f')))"
190 |    ]
191 |   }
192 |  ],
193 |  "metadata": {
194 |   "kernelspec": {
195 |    "display_name": "Python 3",
196 |    "language": "python",
197 |    "name": "python3"
198 |   },
199 |   "language_info": {
200 |    "codemirror_mode": {
201 |     "name": "ipython",
202 |     "version": 3
203 |    },
204 |    "file_extension": ".py",
205 |    "mimetype": "text/x-python",
206 |    "name": "python",
207 |    "nbconvert_exporter": "python",
208 |    "pygments_lexer": "ipython3",
209 |    "version": "3.7.2"
210 |   }
211 |  },
212 |  "nbformat": 4,
213 |  "nbformat_minor": 2
214 | }
215 | 


--------------------------------------------------------------------------------
/Byte/Images to Comma-Separated Values(csv).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Images to Comma-Separated Values(csv)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from PIL import Image\n",
 24 |     "import numpy as np\n",
 25 |     "import csv"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "Reading IDs and labels from csv"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 2,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n",
 42 |     "malwarelist=malwarelist[1:]"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "Path to resized images(32x32) "
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 3,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "folder='...\\\\resize images\\\\'"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Below code read the images and then flatten their values(i.e. 32x32=1024). These values stored in the data_to_write list against their IDs and labels"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 5,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n",
 75 |     "data_to_write=[]\n",
 76 |     "for name in malwarelist:\n",
 77 |     "    b = \".bytes\"\n",
 78 |     "    nam=name[0].strip('\"')\n",
 79 |     "    if nam not in c_files:\n",
 80 |     "        loc = folder+nam + b\n",
 81 |     "        img = np.asarray(Image.open(folder+nam+\".png\"))\n",
 82 |     "        img = img.flatten()\n",
 83 |     "        img= img.tolist()\n",
 84 |     "        img.append(nam)\n",
 85 |     "        img.append(name[1])\n",
 86 |     "        data_to_write.append(img)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "Following commands generate the CSV from the values store in data_to_write list"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 8,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "myFile = open('data_new.csv', 'w', newline='')  \n",
103 |     "with myFile:  \n",
104 |     "    writer = csv.writer(myFile)\n",
105 |     "    writer.writerows(data_to_write)"
106 |    ]
107 |   }
108 |  ],
109 |  "metadata": {
110 |   "kernelspec": {
111 |    "display_name": "Python 3",
112 |    "language": "python",
113 |    "name": "python3"
114 |   },
115 |   "language_info": {
116 |    "codemirror_mode": {
117 |     "name": "ipython",
118 |     "version": 3
119 |    },
120 |    "file_extension": ".py",
121 |    "mimetype": "text/x-python",
122 |    "name": "python",
123 |    "nbconvert_exporter": "python",
124 |    "pygments_lexer": "ipython3",
125 |    "version": "3.7.2"
126 |   }
127 |  },
128 |  "nbformat": 4,
129 |  "nbformat_minor": 2
130 | }
131 | 


--------------------------------------------------------------------------------
/Byte/Min-max normalization(Byte).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Min-max normalization(Byte)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries and initializing variables"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 17,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from PIL import Image\n",
 24 |     "import numpy as np\n",
 25 |     "import csv\n",
 26 |     "train_full='data_to_traFull.csv'\n",
 27 |     "test='data_to_test.csv'\n",
 28 |     "train='data_to_tra.csv'\n",
 29 |     "valid='data_to_val.csv'"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "In this section of code, train data is normalized with respect to its max and min values against each feature.  "
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 18,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "new_train_data = np.genfromtxt(train, delimiter=\",\", dtype=str)\n",
 46 |     "tra_data= new_train_data[:,0:-2]\n",
 47 |     "tra_lab=new_train_data[:,-1]\n",
 48 |     "tra_name=new_train_data[:,-2]\n",
 49 |     "tra_lab=tra_lab.astype(np.int)\n",
 50 |     "tra_data=tra_data.astype(np.float)\n",
 51 |     "tra_min=np.min(tra_data,axis=0)\n",
 52 |     "tra_max=np.max(tra_data,axis=0)\n",
 53 |     "tra_dom=np.subtract(tra_max,tra_min)\n",
 54 |     "tra_dom=np.where(tra_dom == 0, 1, tra_dom)\n",
 55 |     "img = np.subtract(tra_data,tra_min)\n",
 56 |     "img = np.divide(img, tra_dom)\n",
 57 |     "img= img.tolist()\n",
 58 |     "\n",
 59 |     "for i in range(len(tra_lab)):\n",
 60 |     "    img[i].append(tra_name[i])\n",
 61 |     "    img[i].append(tra_lab[i])\n",
 62 |     "\n",
 63 |     "myFile = open('data_to_tra.csv', 'w', newline='')  \n",
 64 |     "\n",
 65 |     "with myFile:  \n",
 66 |     "    writer = csv.writer(myFile)\n",
 67 |     "    writer.writerows(img)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "Validation data is normalized with respect to training data max and min values against each feature"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 19,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "new_valid_data = np.genfromtxt(valid, delimiter=\",\", dtype=str)\n",
 84 |     "valid_data= new_valid_data[:,0:-2]\n",
 85 |     "valid_name=new_valid_data[:,-2]\n",
 86 |     "valid_lab=new_valid_data[:,-1]\n",
 87 |     "valid_lab=valid_lab.astype(np.int)\n",
 88 |     "valid_data=valid_data.astype(np.float)\n",
 89 |     "img = np.subtract(valid_data,tra_min)\n",
 90 |     "img = np.divide(img, tra_dom)\n",
 91 |     "img= img.tolist()\n",
 92 |     "\n",
 93 |     "for i in range(len(valid_lab)):\n",
 94 |     "    img[i].append(valid_name[i])\n",
 95 |     "    img[i].append(valid_lab[i])\n",
 96 |     "\n",
 97 |     "\n",
 98 |     "myFile = open('data_to_val.csv', 'w', newline='')  \n",
 99 |     "\n",
100 |     "with myFile:  \n",
101 |     "    writer = csv.writer(myFile)\n",
102 |     "    writer.writerows(img)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "In this section of code trainfull data is normalized with respect to its max and min values against each feature."
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 20,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "new_x_data = np.genfromtxt(train_full, delimiter=\",\", dtype=str)\n",
119 |     "x_data= new_x_data[:,0:-2]\n",
120 |     "x_name=new_x_data[:,-2]\n",
121 |     "x_lab=new_x_data[:,-1]\n",
122 |     "x_lab=x_lab.astype(np.int)\n",
123 |     "x_data=x_data.astype(np.float)\n",
124 |     "x_min=np.min(x_data,axis=0)\n",
125 |     "x_max=np.max(x_data,axis=0)\n",
126 |     "x_dom=np.subtract(x_max,x_min)\n",
127 |     "x_dom=np.where(x_dom == 0, 1, x_dom)\n",
128 |     "img = np.subtract(x_data,x_min)\n",
129 |     "img = np.divide(img, x_dom)\n",
130 |     "img= img.tolist()\n",
131 |     "\n",
132 |     "for i in range(len(x_lab)):\n",
133 |     "    img[i].append(x_name[i])\n",
134 |     "    img[i].append(x_lab[i])\n",
135 |     "\n",
136 |     "\n",
137 |     "myFile = open('data_to_traFull.csv', 'w', newline='')  \n",
138 |     "\n",
139 |     "with myFile:  \n",
140 |     "    writer = csv.writer(myFile)\n",
141 |     "    writer.writerows(img)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Test data is normalized with respect to trainfull data max and min values against each feature."
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 21,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "new_test_data = np.genfromtxt(test, delimiter=\",\", dtype=str)\n",
158 |     "test_data= new_test_data[:,0:-2]\n",
159 |     "test_lab=new_test_data[:,-1]\n",
160 |     "test_name=new_test_data[:,-2]\n",
161 |     "test_lab=test_lab.astype(np.int)\n",
162 |     "test_data=test_data.astype(np.float)\n",
163 |     "img = np.subtract(test_data,x_min)\n",
164 |     "img = np.divide(img, x_dom)\n",
165 |     "img= img.tolist()\n",
166 |     "\n",
167 |     "for i in range(len(test_lab)):\n",
168 |     "    img[i].append(test_name[i])\n",
169 |     "    img[i].append(test_lab[i])\n",
170 |     "\n",
171 |     "\n",
172 |     "myFile = open('data_to_test.csv', 'w', newline='')  \n",
173 |     "\n",
174 |     "with myFile:  \n",
175 |     "    writer = csv.writer(myFile)\n",
176 |     "    writer.writerows(img)"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.7.2"
197 |   }
198 |  },
199 |  "nbformat": 4,
200 |  "nbformat_minor": 2
201 | }
202 | 


--------------------------------------------------------------------------------
/Byte/Read me:
--------------------------------------------------------------------------------
 1 | The squence of these tutorials is given below:
 2 | 
 3 | 
 4 | 1. Byte file to image Coversion
 5 | 
 6 | 2. Resizing images to 32x32
 7 | 
 8 | 3. Images to Comma-Separated Values(csv)
 9 | 
10 | 4. Stratified sampling
11 | 
12 | 5. Min-max normalization(Byte)
13 | 
14 | 6. Feature extractor(CNN-optimization)
15 | 
16 | 7. Feature extractor (CNN with pretrained encoder layers) - optimization
17 | 
18 | 
19 | The preprocessed data of Byte files plus the data use in these tutorial can be found on the Byte folder of following drive link:
20 | 
21 | https://drive.google.com/drive/folders/1s7EC4s_-hP9q5vEhs-3vAubspcZbBADK?usp=sharing
22 | 


--------------------------------------------------------------------------------
/Byte/Resizing images to 32x32.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Resizing images to 32x32"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from PIL import Image\n",
 24 |     "import numpy as np"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "Reading data from CSV"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "malwarelist=np.genfromtxt('trainLabels.csv',delimiter=\",\",dtype=str)\n",
 41 |     "malwarelist=malwarelist[1:]"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "Setting paths"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 3,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "destination='....\\\\resize images\\\\'\n",
 58 |     "folder='....\\\\images\\\\'"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Below code reads actual images and resize them to 32x32 using BICUBIC interpolation, whereas corrupted file names stored in the c_files list variable"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 4,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "c_files=['58kxhXouHzFd4g3rmInB','6tfw0xSL2FNHOCJBdlaA','a9oIzfw03ED4lTBCt52Y','cf4nzsoCmudt1kwleOTI','d0iHC6ANYGon7myPFzBe','da3XhOZzQEbKVtLgMYWv','fRLS3aKkijp4GH0Ds6Pv','IidxQvXrlBkWPZAfcqKT']\n",
 75 |     "for name in malwarelist:\n",
 76 |     "    b = \".bytes\"\n",
 77 |     "    nam=name[0].strip('\"')\n",
 78 |     "    if nam not in c_files:\n",
 79 |     "        loc = folder+nam + b\n",
 80 |     "        img=Image.open(folder+nam+\".png\")\n",
 81 |     "        img = img.resize((32, 32), Image.BICUBIC)\n",
 82 |     "        img.save(destination+nam+\".png\")"
 83 |    ]
 84 |   }
 85 |  ],
 86 |  "metadata": {
 87 |   "kernelspec": {
 88 |    "display_name": "Python 3",
 89 |    "language": "python",
 90 |    "name": "python3"
 91 |   },
 92 |   "language_info": {
 93 |    "codemirror_mode": {
 94 |     "name": "ipython",
 95 |     "version": 3
 96 |    },
 97 |    "file_extension": ".py",
 98 |    "mimetype": "text/x-python",
 99 |    "name": "python",
100 |    "nbconvert_exporter": "python",
101 |    "pygments_lexer": "ipython3",
102 |    "version": "3.7.2"
103 |   }
104 |  },
105 |  "nbformat": 4,
106 |  "nbformat_minor": 2
107 | }
108 | 


--------------------------------------------------------------------------------
/Byte/Stratified sampling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Stratified sampling"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 7,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import numpy\n",
 24 |     "import random\n",
 25 |     "import csv\n",
 26 |     "from math import floor"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "The stratified_data takes three parameters, target_file, new_file(name of the file where to store new stratified data) and splitting rate. In Section-1 of the following code data, name and labels are read from CSV (numpy array format) and converted to list for further processing. As samples are in cluster form, so for the purpose of splitting, started index of each group is stored in Section-2. Random but stratified indexes of samples are pick from each group and store in the stratified_sample list variable in Section-3. In Section-4, against each stored index in stratified_sample list data of samples along with their name and labels store in data_to_write list for writing it to the CSV"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 8,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "def stratified_data(target_file,new_file,spliting_rate=0.2):\n",
 43 |     "\n",
 44 |     "#Section-1\n",
 45 |     "    trainlabel = numpy.genfromtxt(target_file, delimiter=\",\", dtype=str)\n",
 46 |     "    FileName=trainlabel[0:,-2]\n",
 47 |     "    dat=trainlabel[0:,0:-2]\n",
 48 |     "    dat=dat.astype(numpy.float)\n",
 49 |     "    Label=trainlabel[0:, -1]\n",
 50 |     "    Label=Label.astype(numpy.int)    \n",
 51 |     "    dat=dat.tolist()\n",
 52 |     "    FileName = FileName.tolist()\n",
 53 |     "    Label=Label.tolist()\n",
 54 |     "\n",
 55 |     "#Section-2\n",
 56 |     "    record=[]\n",
 57 |     "    b=[]\n",
 58 |     "    count=0\n",
 59 |     "    temp=[]\n",
 60 |     "    starting_index=0\n",
 61 |     "\n",
 62 |     "    for i in Label:\n",
 63 |     "        if i not in b:\n",
 64 |     "            if starting_index!=0:\n",
 65 |     "                temp.append(count)\n",
 66 |     "                record.append(temp)\n",
 67 |     "            #     print(temp)\n",
 68 |     "                temp=[]\n",
 69 |     "                count=0\n",
 70 |     "            b.append(i)\n",
 71 |     "            temp.append(i)\n",
 72 |     "            temp.append(starting_index)\n",
 73 |     "        count+=1\n",
 74 |     "        starting_index += 1\n",
 75 |     "    temp.append(count)\n",
 76 |     "    record.append(temp)\n",
 77 |     "    \n",
 78 |     "#Section-3\n",
 79 |     "    stratified_sample=[]\n",
 80 |     "    for i in record:\n",
 81 |     "        count=0\n",
 82 |     "        temp=[]\n",
 83 |     "        while count<=floor(i[2]*spliting_rate):\n",
 84 |     "            a=random.randint(i[1],(i[1]+i[2]-1))\n",
 85 |     "            if a not in temp:\n",
 86 |     "                #      print(a)\n",
 87 |     "                count+=1\n",
 88 |     "                temp.append(a)\n",
 89 |     "        stratified_sample.append(temp)\n",
 90 |     "#Section-4\n",
 91 |     "    data_to_write=[]\n",
 92 |     "    count=0\n",
 93 |     "    c=0\n",
 94 |     "    for i in stratified_sample:\n",
 95 |     "        c+=1\n",
 96 |     "        print('Class ',c,' samples = ',len(i))\n",
 97 |     "        count+=len(i)\n",
 98 |     "        for j in i:\n",
 99 |     "            a=dat[j]\n",
100 |     "            a.append(FileName[j].strip('\"'))\n",
101 |     "            a.append(Label[j])\n",
102 |     "            data_to_write.append(a)\n",
103 |     "    print('Total samples ',count)\n",
104 |     "\n",
105 |     "    myFile = open(new_file, 'w', newline='')  \n",
106 |     "    with myFile:  \n",
107 |     "        writer = csv.writer(myFile)\n",
108 |     "        writer.writerows(data_to_write)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "The train_validation_split function do the actual work of splitting using the stratified_data function, it takes the name of target CSV and generates two CSV, train and validation without affecting the original CSV"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 9,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "def train_validation_split(data_csv,train_csv,validation_csv,training_data_rate=0.5):\n",
125 |     "    \n",
126 |     "    data_to_write=[]\n",
127 |     "    to_list=[]\n",
128 |     "    \n",
129 |     "    stratified_data(data_csv,train_csv,training_data_rate)\n",
130 |     "    \n",
131 |     "    new_train_data = numpy.genfromtxt(train_csv, delimiter=\",\", dtype=str)\n",
132 |     "    new_whole_data = numpy.genfromtxt(data_csv, delimiter=\",\", dtype=str)\n",
133 |     "    \n",
134 |     "    for i in new_train_data:\n",
135 |     "    \n",
136 |     "        to_list.append(i[-2])\n",
137 |     "    \n",
138 |     "    for i in new_whole_data:\n",
139 |     "        a=i[-2].strip('\"')\n",
140 |     "        if a not in to_list:\n",
141 |     "        \n",
142 |     "            data_to_write.append(i)\n",
143 |     "    myFile = open(validation_csv, 'w', newline='')  \n",
144 |     "    \n",
145 |     "    with myFile:  \n",
146 |     "        writer = csv.writer(myFile)\n",
147 |     "        writer.writerows(data_to_write)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "Demo to use train_validation_split function"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 10,
160 |    "metadata": {},
161 |    "outputs": [
162 |     {
163 |      "name": "stdout",
164 |      "output_type": "stream",
165 |      "text": [
166 |       "No. of samples in different classes that store in data_to_traFull according to given rate\n",
167 |       "\n",
168 |       "Class  1  samples =  1150\n",
169 |       "Class  2  samples =  1859\n",
170 |       "Class  3  samples =  2207\n",
171 |       "Class  4  samples =  357\n",
172 |       "Class  5  samples =  32\n",
173 |       "Class  6  samples =  564\n",
174 |       "Class  7  samples =  299\n",
175 |       "Class  8  samples =  922\n",
176 |       "Class  9  samples =  760\n",
177 |       "Total samples  8150\n",
178 |       "No. of samples in different classes that store in data_to_tra according to given rate\n",
179 |       "\n",
180 |       "Class  1  samples =  863\n",
181 |       "Class  2  samples =  1395\n",
182 |       "Class  3  samples =  1656\n",
183 |       "Class  4  samples =  268\n",
184 |       "Class  5  samples =  25\n",
185 |       "Class  6  samples =  424\n",
186 |       "Class  7  samples =  225\n",
187 |       "Class  8  samples =  692\n",
188 |       "Class  9  samples =  571\n",
189 |       "Total samples  6119\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "data='data_new.csv'\n",
195 |     "\n",
196 |     "train_full='data_to_traFull.csv'\n",
197 |     "test='data_to_test.csv'\n",
198 |     "train='data_to_tra.csv'\n",
199 |     "valid='data_to_val.csv'\n",
200 |     "\n",
201 |     "#0.75 means 75% data will stores in train_full and 25% will be in test  \n",
202 |     "print('No. of samples in different classes that store in data_to_traFull according to given rate\\n')\n",
203 |     "train_validation_split(data,train_full,test,0.75)\n",
204 |     "print('No. of samples in different classes that store in data_to_tra according to given rate\\n')\n",
205 |     "train_validation_split(train_full,train,valid,0.75)"
206 |    ]
207 |   }
208 |  ],
209 |  "metadata": {
210 |   "kernelspec": {
211 |    "display_name": "Python 3",
212 |    "language": "python",
213 |    "name": "python3"
214 |   },
215 |   "language_info": {
216 |    "codemirror_mode": {
217 |     "name": "ipython",
218 |     "version": 3
219 |    },
220 |    "file_extension": ".py",
221 |    "mimetype": "text/x-python",
222 |    "name": "python",
223 |    "nbconvert_exporter": "python",
224 |    "pygments_lexer": "ipython3",
225 |    "version": "3.7.2"
226 |   }
227 |  },
228 |  "nbformat": 4,
229 |  "nbformat_minor": 2
230 | }
231 | 


--------------------------------------------------------------------------------
/Byte/cnn_stacking.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Mon Feb 25 09:59:45 2019
 5 | 
 6 | @author: PR-LAB
 7 | """
 8 | 
 9 | 
10 | 
11 | import torch.nn as nn
12 | 
13 | class Cnn_Stacking(nn.Module):
14 |     def __init__(self,mod,mod1):
15 |         super(Cnn_Stacking, self).__init__()
16 |         self.feature = mod.encoder
17 |         self.feature1 = mod1.encoder
18 |         self.feature2 = nn.Sequential(nn.Linear(256*8 * 8, 500),nn.ReLU(True))
19 |         self.feature3 = nn.Sequential(nn.Linear(500, 9),nn.ReLU(True))
20 |         
21 |     def forward(self, x):
22 |         x = self.feature(x)
23 |         x = self.feature1(x)
24 |         x = x.reshape(x.size(0), -1)
25 |         x = self.feature2(x)
26 |         x = self.feature3(x)
27 | #        x = self.classifier(x)
28 |         return x
29 | 
30 | 
31 | class autoencoder(nn.Module):
32 |     def __init__(self):
33 |         super(autoencoder, self).__init__()
34 |         self.encoder = nn.Sequential(
35 |             nn.Conv2d(1, 128, 3, stride=1, padding=1),  # b, 16, 10, 10
36 |             nn.ReLU(True),
37 |             nn.MaxPool2d(kernel_size=2,stride=2)  # b, 16, 5, 5
38 |        )
39 |         self.decoder = nn.Sequential(
40 |             nn.ConvTranspose2d(128, 1, 2, stride=2),  # b, 16, 5, 5
41 |             nn.ReLU(True)
42 |         )
43 | 
44 | 
45 |     def forward(self, x):
46 |         x = self.encoder(x)
47 |         x = self.decoder(x)
48 |         return x
49 | 
50 | class autoencoder1(nn.Module):
51 |     def __init__(self):
52 |         super(autoencoder1, self).__init__()
53 |         self.encoder = nn.Sequential(
54 |             nn.Conv2d(128, 256, 3, stride=1, padding=1),  # b, 16, 10, 10
55 |             nn.ReLU(True),
56 |             nn.MaxPool2d(kernel_size=2,stride=2)  # b, 16, 5, 5
57 |         )
58 |         self.decoder = nn.Sequential(
59 |             nn.ConvTranspose2d(256, 128, 2, stride=2),  # b, 16, 5, 5
60 |             nn.ReLU(True)
61 |         )
62 | 
63 | 
64 |     def forward(self, x):
65 |         x = self.encoder(x)
66 |         x = self.decoder(x)
67 |         return x
68 | 
69 | 


--------------------------------------------------------------------------------
/Byte/cnn_v1.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | class ConvNet_single(nn.Module):
 4 |     def __init__(self):
 5 |         super(ConvNet_single, self).__init__()
 6 |         self.layer1 = nn.Sequential(
 7 |             nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
 8 |             nn.LeakyReLU(),
 9 |             nn.MaxPool2d(kernel_size=2))
10 |         
11 |         self.l1_batchnorm=nn.BatchNorm2d(64)
12 |         
13 |         self.layer2 = nn.Sequential(
14 |             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
15 |             nn.LeakyReLU(),
16 |             nn.MaxPool2d(kernel_size=2))
17 |         
18 |         self.l2_batchnorm=nn.BatchNorm2d(128)
19 |         
20 |         self.layer3 = nn.Sequential(
21 |             nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
22 |             nn.LeakyReLU(),
23 |             nn.MaxPool2d(kernel_size=2))
24 |         
25 |         self.l3_batchnorm=nn.BatchNorm2d(256)
26 |         
27 |         
28 |         self.layer4 = nn.Sequential(
29 |             nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
30 |             nn.LeakyReLU(),
31 |             nn.MaxPool2d(kernel_size=2))
32 |         
33 |         self.l4_batchnorm=nn.BatchNorm2d(512)
34 |         
35 |         self.layer5 = nn.Sequential(
36 |             nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1),
37 |             nn.LeakyReLU())
38 |         
39 |         self.l5_batchnorm=nn.BatchNorm2d(1024)
40 |         
41 |         self.leakyrelu=nn.LeakyReLU()
42 |         
43 |         self.fc = nn.Linear(2*2*1024,1000)
44 |         self.fc01= nn.Linear(1000,500)
45 |         self.fc1= nn.Linear(500,9)
46 | 
47 |     def forward(self, x):
48 |         out = self.layer1(x)
49 |         out = self.l1_batchnorm(out)
50 |         out = self.layer2(out)
51 |         out = self.l2_batchnorm(out)
52 |         out=  self.layer3(out)
53 |         out = self.l3_batchnorm(out)
54 |         out=  self.layer4(out)
55 |         out = self.l4_batchnorm(out)
56 |         out=  self.layer5(out)
57 |         out = self.l5_batchnorm(out)
58 |         out = out.reshape(out.size(0), -1)
59 |         out = self.fc(out)
60 |         out=self.leakyrelu(out)
61 |         out= self.fc01(out)
62 |         out=self.leakyrelu(out)
63 |         out= self.fc1(out)
64 |         return out
65 | 


--------------------------------------------------------------------------------
/Byte/dataloader_csv.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data.dataset import Dataset
 2 | from torchvision import transforms
 3 | import pandas as pd
 4 | import numpy as np
 5 | import torch
 6 | from PIL import Image
 7 | class CustomDatasetFromImages(Dataset):
 8 |     def __init__(self, csv_path):
 9 |         # Transforms
10 |         self.to_tensor = transforms.ToTensor()
11 |         # Read the csv file
12 |         self.data_info = pd.read_csv(csv_path, header=None)
13 |         
14 |         # First column contains the image paths
15 |         self.image_arr = np.asarray(self.data_info.iloc[:, 0:-2])
16 |         # Second column is the labels
17 |         self.label_arr = np.asarray(self.data_info.iloc[:, -1])-1
18 |         # Third column is for an operation indicator
19 |         self.operation_arr = np.asarray(self.data_info.iloc[:, -2])
20 |         # Calculate len
21 |         self.data_len = len(self.data_info.index)
22 |     def __getitem__(self, index):
23 |         # Get image name from the pandas df
24 |         single_image_name = self.image_arr[index]
25 | 
26 |         single_image_name = np.resize(single_image_name,(32,32))
27 |         img_as_tensor =torch.Tensor(single_image_name)
28 |         img_as_tensor = torch.unsqueeze(img_as_tensor,0)
29 |         single_image_label = self.label_arr[index]
30 |         return (img_as_tensor, single_image_label)
31 | 
32 |     def __len__(self):
33 |         return self.data_len
34 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/ANN-Results.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# ANN-Results"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Final results of the proposed methodology obtain by running the following code. It is executed 10 ten times to check the stability of the model(you can also use a loop)."
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "from final_hybrid_csv import CSV_READER\n",
 31 |     "from ann_hybrid import ANN\n",
 32 |     "import torch\n",
 33 |     "import torch.nn as nn\n",
 34 |     "import torch.utils.data as data\n",
 35 |     "import numpy as np\n",
 36 |     "from sklearn.metrics import confusion_matrix\n",
 37 |     "from sklearn.preprocessing import label_binarize"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "Checking the availablity of the GPU"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 2,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
 54 |     "if torch.cuda.is_available(): \n",
 55 |     "    print(\"gpu available\")"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "Setting the optimize parameters that we got in the optimization phase"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 3,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "num_epochs,num_classes,batch_size,learning_rate,w_decay = 10, 9,50,0.001,0.001"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "Reading the train and test hybrid data"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 5,
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "train_path='n_final_hybrid_full_train.csv'\n",
 88 |     "test_path='n_final_hybrid_test.csv'\n",
 89 |     "\n",
 90 |     "train_data = CSV_READER(train_path)\n",
 91 |     "test_data = CSV_READER(test_path)\n",
 92 |     "test_size = len(test_data)\n",
 93 |     "\n",
 94 |     "train_data_loader = data.DataLoader(train_data, batch_size=batch_size,shuffle=True)\n",
 95 |     "test_data_loader  = data.DataLoader(test_data, batch_size=test_size, shuffle=True) "
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "Creating the model, defining the loss and optimizer"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 6,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "model=ANN().to(device)\n",
112 |     "criterion = nn.CrossEntropyLoss()\n",
113 |     "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay=w_decay)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "Training a model to the defined number of epochs"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 7,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "for epoch in range(num_epochs):\n",
130 |     "\n",
131 |     "    for i, (images,labels) in enumerate(train_data_loader):\n",
132 |     "        images = images.to(device)\n",
133 |     "        labels = labels.to(device)\n",
134 |     "        outputs = model(images)\n",
135 |     "        _, predicted = torch.max(outputs[:,0].data, 1)\n",
136 |     "        loss = criterion(outputs[:,0], labels)\n",
137 |     "        x=np.array((predicted==labels).cpu())\n",
138 |     "        optimizer.zero_grad() \n",
139 |     "        loss.backward()\n",
140 |     "        optimizer.step()"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "Doing the test phase and printing loss and accuracy"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 8,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "for y,(images,labels) in enumerate(test_data_loader):\n",
157 |     "    images=images.to(device)\n",
158 |     "    labels=labels.to(device)\n",
159 |     "    outputs=model(images)\n",
160 |     "    _, predicted = torch.max(outputs[:,0].data, 1)\n",
161 |     "    test_loss = criterion(outputs[:,0], labels)\n",
162 |     "    test_x=np.array((predicted==labels).cpu())\n",
163 |     "    print(\"Test_loss=\"+str(test_loss.item())+\" Test_acc=\"+str((sum(test_x))*100/len(test_x)))"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "Generating the confusion matrix"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 9,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "test_labels_bin=label_binarize(labels.cpu(),classes=[0,1,2,3,4,5,6,7,8])\n",
180 |     "cm=confusion_matrix(labels.cpu(),predicted.cpu())\n",
181 |     "print(cm)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "Note:- For doing the optimization of ANN we use the following code:\n",
189 |     "\n",
190 |     "\n",
191 |     "\n",
192 |     "\n",
193 |     "from final_hybrid_csv import CSV_READER\n",
194 |     "from ann_hybrid import ANN\n",
195 |     "import torch\n",
196 |     "import torch.nn as nn\n",
197 |     "import torch.utils.data as data\n",
198 |     "import numpy as np\n",
199 |     "\n",
200 |     "\n",
201 |     "\n",
202 |     "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
203 |     "if torch.cuda.is_available(): \n",
204 |     "    print(\"gpu availible\")\n",
205 |     "\n",
206 |     "\n",
207 |     "num_epochs,num_classes,batch_size,learning_rate = 10, 9,50,0.001\n",
208 |     "\n",
209 |     "\n",
210 |     "train_path='n_final_hybrid_train.csv'\n",
211 |     "val_path='n_final_hybrid_valid.csv'\n",
212 |     "\n",
213 |     "\n",
214 |     "\n",
215 |     "\n",
216 |     "train_data = CSV_READER(train_path)\n",
217 |     "val_data = CSV_READER(val_path)\n",
218 |     "val_size = len(val_data)\n",
219 |     "\n",
220 |     "train_data_loader = data.DataLoader(train_data, batch_size=batch_size,shuffle=True)\n",
221 |     "val_data_loader  = data.DataLoader(val_data, batch_size=val_size, shuffle=True) \n",
222 |     "\n",
223 |     "model=ANN().to(device)\n",
224 |     "\n",
225 |     "\n",
226 |     "criterion = nn.CrossEntropyLoss()\n",
227 |     "\n",
228 |     "path_to_checkpoint='.../_optimize/'\n",
229 |     "\n",
230 |     "check_name='checkpoint.pth.tar'\n",
231 |     "\n",
232 |     "\n",
233 |     "\n",
234 |     "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay=0.001)\n",
235 |     "\n",
236 |     "\n",
237 |     "def save_checkpoint(state, is_best, filename,loss):\n",
238 |     "    if is_best:\n",
239 |     "        print (\"=> Saving a new lowest loss : \"+str(loss))\n",
240 |     "        torch.save(state, filename)  # save checkpoint\n",
241 |     "\n",
242 |     "\n",
243 |     "total_step = len(train_data_loader)\n",
244 |     "lowest_loss=0.200\n",
245 |     "checkpoint_step= int(total_step/8)\n",
246 |     "\n",
247 |     "\n",
248 |     "for epoch in range(num_epochs):\n",
249 |     "\n",
250 |     "    for i, (images,labels) in enumerate(train_data_loader):\n",
251 |     "        images = images.to(device)\n",
252 |     "        labels = labels.to(device)\n",
253 |     "    \n",
254 |     "        # Forward pass\n",
255 |     "        outputs = model(images)\n",
256 |     "\n",
257 |     "        _, predicted = torch.max(outputs[:,0].data, 1)\n",
258 |     "    \n",
259 |     "        loss = criterion(outputs[:,0], labels)\n",
260 |     " \n",
261 |     "        x=np.array((predicted==labels).cpu())\n",
262 |     "        # Backward and optimize\n",
263 |     "        optimizer.zero_grad() \n",
264 |     "        loss.backward()\n",
265 |     "        optimizer.step()\n",
266 |     "        val_loss_list=[]\n",
267 |     "        val_acc_list =[]\n",
268 |     "        \n",
269 |     "        for y,(images,labels) in enumerate(val_data_loader):\n",
270 |     "            images=images.to(device)\n",
271 |     "            labels=labels.to(device)\n",
272 |     "            outputs=model(images)\n",
273 |     "            _, predicted = torch.max(outputs[:,0].data, 1)\n",
274 |     "            val_loss = criterion(outputs[:,0], labels)\n",
275 |     "            val_loss_list.append(val_loss.item())\n",
276 |     "            \n",
277 |     "            val_x=np.array((predicted==labels).cpu())\n",
278 |     "            val_acc=(sum(val_x))*100/len(val_x)\n",
279 |     "            val_acc_list.append(val_acc)\n",
280 |     "        mean_loss= np.mean(np.array(val_loss_list))\n",
281 |     "        mean_acc = np.mean(np.array(val_acc_list))\n",
282 |     "        is_best= bool(mean_loss<lowest_loss)\n",
283 |     "        \n",
284 |     "        if(is_best):\n",
285 |     "            lowest_loss= min(mean_loss,lowest_loss)\n",
286 |     "            pathe= path_to_checkpoint+str(lowest_loss)+\" \"+str(epoch+1)+\" \"+check_name\n",
287 |     "            save_checkpoint({'epoch':epoch + 1,'state_dict': model.state_dict(),'lowest_loss': lowest_loss }, is_best,pathe,lowest_loss)\n",
288 |     "            is_best= False\n",
289 |     " \n",
290 |     "        print(\"Epoch [\"+str(epoch+1)+\"/\"+str(num_epochs)+\"],Batch_no[\"+str(i+1)+\"/\"+str(total_step)+\"] \"+\"Loss:\"+str(format(loss.item(),'.4f'))+\" Acc :\"+str(format((sum(x))*100/len(x),'.4f'))+\" Val_loss:\"+str(format(mean_loss,'.4f'))+\" Val_acc:\"+str(format(mean_acc,'.0f')))\n",
291 |     "        "
292 |    ]
293 |   }
294 |  ],
295 |  "metadata": {
296 |   "kernelspec": {
297 |    "display_name": "Python 3",
298 |    "language": "python",
299 |    "name": "python3"
300 |   },
301 |   "language_info": {
302 |    "codemirror_mode": {
303 |     "name": "ipython",
304 |     "version": 3
305 |    },
306 |    "file_extension": ".py",
307 |    "mimetype": "text/x-python",
308 |    "name": "python",
309 |    "nbconvert_exporter": "python",
310 |    "pygments_lexer": "ipython3",
311 |    "version": "3.7.2"
312 |   }
313 |  },
314 |  "nbformat": 4,
315 |  "nbformat_minor": 2
316 | }
317 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/Creating hybrid dataset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Creating hybrid dataset"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "In this file, a hybrid dataset is created from the byte file and ASM file. Byte file features are extracted using two models, one is a simple CNN and another one is CNN with pretrained encoder layers. All byte file data pass through both of these pretrained models for feature extraction and then concatenated with the ASM file feature which was selected using SVM classifier "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Importing libraries and initailizing variables"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import torch\n",
 31 |     "import torch.utils.data as data\n",
 32 |     "import numpy as np\n",
 33 |     "import os\n",
 34 |     "from cnn_v1 import ConvNet_single\n",
 35 |     "import torch.nn.functional as F\n",
 36 |     "from dataloader_csv import CustomDatasetFromImages\n",
 37 |     "import csv\n",
 38 |     "from cnn_stacking import autoencoder,autoencoder1\n",
 39 |     "from cnn_stacking import Cnn_Stacking"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "As pretrained models are store in Cuda version, so for loading them GPU must be available "
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 3,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
 56 |     "if torch.cuda.is_available(): \n",
 57 |     "    print(\"gpu available\")"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Loading byte File data"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "test_path=  'data_to_test.csv'\n",
 74 |     "\n",
 75 |     "test_data = CustomDatasetFromImages(test_path)\n",
 76 |     "\n",
 77 |     "test_data_loader  = data.DataLoader(test_data, batch_size=1, shuffle=False) \n",
 78 |     "\n",
 79 |     "train_path=  'data_to_tra.csv'\n",
 80 |     "\n",
 81 |     "train_data = CustomDatasetFromImages(train_path)\n",
 82 |     "\n",
 83 |     "train_data_loader  = data.DataLoader(train_data, batch_size=1, shuffle=False) \n",
 84 |     "\n",
 85 |     "valid_path=  'data_to_val.csv'\n",
 86 |     "\n",
 87 |     "valid_data = CustomDatasetFromImages(valid_path)\n",
 88 |     "\n",
 89 |     "valid_data_loader  = data.DataLoader(valid_data, batch_size=1, shuffle=False) \n",
 90 |     "\n",
 91 |     "fulL_train_path=  'data_to_traFull.csv'\n",
 92 |     "\n",
 93 |     "fulL_train_data = CustomDatasetFromImages(fulL_train_path)\n",
 94 |     "\n",
 95 |     "fulL_train_data_loader  = data.DataLoader(fulL_train_data, batch_size=1, shuffle=False) \n"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "Creating the First model, which will extract features from byte files"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 5,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "model= ConvNet_single().to(device)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "Loading the best model state during training"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 6,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "path_to_checkpoint = '..../saved_model_state/'\n",
128 |     "\n",
129 |     "scores=[]\n",
130 |     "List= os.listdir(path_to_checkpoint)\n",
131 |     "for i in range(len(List)):\n",
132 |     "    scores.append(float(List[i].split()[0]))\n",
133 |     "max_idx= np.argmin(scores)\n",
134 |     "min_loss_path= path_to_checkpoint+List[max_idx]\n",
135 |     "if (len(List)!=0):        \n",
136 |     "    checkpoint= torch.load(min_loss_path)\n",
137 |     "    model.load_state_dict(checkpoint['state_dict'])\n",
138 |     "    model.eval()\n",
139 |     "    print(\"model initialized\")"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "Loading the ASM file data(opcode frequencies)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 7,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "path_tr='n_500_data_to_tra.csv'\n",
156 |     "path_futr='n_500_data_to_traFull.csv'\n",
157 |     "path_te='n_500_data_to_test.csv'\n",
158 |     "path_val='n_500_data_to_val.csv'\n",
159 |     "\n",
160 |     "train = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n",
161 |     "\n",
162 |     "val = np.genfromtxt(path_val, delimiter=\",\", dtype=str)\n",
163 |     "\n",
164 |     "test = np.genfromtxt(path_te, delimiter=\",\", dtype=str)\n",
165 |     "\n",
166 |     "full_train = np.genfromtxt(path_futr, delimiter=\",\", dtype=str)\n",
167 |     "\n",
168 |     "train=train[1:,:]\n",
169 |     "\n",
170 |     "val=val[1:,:]\n",
171 |     "\n",
172 |     "test=test[1:,:]\n",
173 |     "\n",
174 |     "full_train=full_train[1:,:]\n"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "Following piece of code extracts the feature from test data of byte file and concatenate them to top 116 features selected by SVM"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 8,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "data_to_write_test=[]\n",
191 |     "\n",
192 |     "for image,name,label in test_data_loader:\n",
193 |     "    image=image.to(device)\n",
194 |     "\n",
195 |     "    output=model(image)\n",
196 |     "    output=F.softmax(output)\n",
197 |     "    output = output.reshape(output.size(0), -1)\n",
198 |     "    \n",
199 |     "    output=output.to('cpu')\n",
200 |     "    output=output.detach().numpy()\n",
201 |     "    output=output.tolist()\n",
202 |     "    output=output[0]\n",
203 |     "    for i in test:\n",
204 |     "        if i[-2]==name[0]:\n",
205 |     "            print(i[-2],name[0])\n",
206 |     "            for j in i[0:116]:\n",
207 |     "                output.append(j)\n",
208 |     "            output.append(i[-2])\n",
209 |     "            output.append(i[-1])\n",
210 |     "            \n",
211 |     "    data_to_write_test.append(output)\n",
212 |     "\n",
213 |     "myFile = open('final_hybrid_test.csv', 'w', newline='')  \n",
214 |     "\n",
215 |     "with myFile:  \n",
216 |     "    writer = csv.writer(myFile)\n",
217 |     "    writer.writerows(data_to_write_test)\n"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "Following piece of code extracts the feature from train data of byte file and concatenate them to top 116 features selected by SVM"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 9,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "data_to_write_train=[]\n",
234 |     "\n",
235 |     "for image,name,label in train_data_loader:\n",
236 |     "    image=image.to(device)\n",
237 |     "    output=model(image)\n",
238 |     "    output=F.softmax(output) \n",
239 |     "    output = output.reshape(output.size(0), -1)\n",
240 |     "    \n",
241 |     "    output=output.to('cpu')\n",
242 |     "    output=output.detach().numpy()\n",
243 |     "    output=output.tolist()\n",
244 |     "    output=output[0]\n",
245 |     "    for i in train:\n",
246 |     "        if i[-2]==name[0]:\n",
247 |     "            print(i[-2],name[0])\n",
248 |     "            for j in i[0:116]:\n",
249 |     "                output.append(j)\n",
250 |     "            output.append(i[-2])\n",
251 |     "            output.append(i[-1])\n",
252 |     "            \n",
253 |     "    data_to_write_train.append(output)\n",
254 |     "\n",
255 |     "myFile = open('final_hybrid_train.csv', 'w', newline='')  \n",
256 |     "\n",
257 |     "with myFile:  \n",
258 |     "    writer = csv.writer(myFile)\n",
259 |     "    writer.writerows(data_to_write_train)\n"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "Following piece of code extracts the feature from validation data of byte file and concatenate them to top 116 features selected by SVM"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 10,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "data_to_write_val=[]\n",
276 |     "\n",
277 |     "for image,name,label in valid_data_loader:\n",
278 |     "    image=image.to(device)\n",
279 |     "    output=model(image)\n",
280 |     "    output=F.softmax(output)\n",
281 |     "    \n",
282 |     "    output = output.reshape(output.size(0), -1)\n",
283 |     "    \n",
284 |     "    output=output.to('cpu')\n",
285 |     "    output=output.detach().numpy()\n",
286 |     "    output=output.tolist()\n",
287 |     "    output=output[0]\n",
288 |     "    for i in val:\n",
289 |     "        if i[-2]==name[0]:\n",
290 |     "            print(i[-2],name[0])\n",
291 |     "            for j in i[0:116]:\n",
292 |     "                output.append(j)\n",
293 |     "            output.append(i[-2])\n",
294 |     "            output.append(i[-1])\n",
295 |     "            \n",
296 |     "    data_to_write_val.append(output)\n",
297 |     "myFile = open('final_hybrid_valid.csv', 'w', newline='')  \n",
298 |     "\n",
299 |     "with myFile:  \n",
300 |     "    writer = csv.writer(myFile)\n",
301 |     "    writer.writerows(data_to_write_val)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "markdown",
306 |    "metadata": {},
307 |    "source": [
308 |     "Following piece of code extracts the feature from overall train data of byte file and concatenate them to top 116 features selected by SVM "
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": 11,
314 |    "metadata": {},
315 |    "outputs": [],
316 |    "source": [
317 |     "data_to_write_full_train=[]\n",
318 |     "\n",
319 |     "for image,name,label in fulL_train_data_loader:\n",
320 |     "    image=image.to(device)\n",
321 |     "    output=model(image)\n",
322 |     "    output=F.softmax(output)\n",
323 |     "    \n",
324 |     "    output = output.reshape(output.size(0), -1)\n",
325 |     "    \n",
326 |     "    output=output.to('cpu')\n",
327 |     "    output=output.detach().numpy()\n",
328 |     "    output=output.tolist()\n",
329 |     "    output=output[0]\n",
330 |     "    for i in full_train:\n",
331 |     "        if i[-2]==name[0]:\n",
332 |     "            print(i[-2],name[0])\n",
333 |     "            for j in i[0:116]:\n",
334 |     "                output.append(j)\n",
335 |     "            output.append(i[-2])\n",
336 |     "            output.append(i[-1])\n",
337 |     "            \n",
338 |     "    data_to_write_full_train.append(output)\n",
339 |     "\n",
340 |     "myFile = open('final_hybrid_full_train.csv', 'w', newline='')  \n",
341 |     "\n",
342 |     "with myFile:  \n",
343 |     "    writer = csv.writer(myFile)\n",
344 |     "    writer.writerows(data_to_write_full_train)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "Now create the second model (CNN with pretrained encoder layers) from which feature of byte file will be extracted"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 12,
357 |    "metadata": {},
358 |    "outputs": [],
359 |    "source": [
360 |     "auto=autoencoder().to(device).float()\n",
361 |     "auto1=autoencoder1().to(device).float()\n",
362 |     "\n",
363 |     "model=Cnn_Stacking(auto,auto1).to(device).float()\n"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "Loading the best model state during FINE TUNING"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 13,
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": [
379 |     "path_to_checkpoint='...//cnn_auto_saved_model//'\n",
380 |     "\n",
381 |     "scores=[]\n",
382 |     "List= os.listdir(path_to_checkpoint)\n",
383 |     "for i in range(len(List)):\n",
384 |     "    scores.append(float(List[i].split()[0]))\n",
385 |     "max_idx= np.argmin(scores)\n",
386 |     "min_loss_path= path_to_checkpoint+List[max_idx]\n",
387 |     "if (len(List)!=0):        \n",
388 |     "    checkpoint= torch.load(min_loss_path)\n",
389 |     "    model.load_state_dict(checkpoint['state_dict'])\n",
390 |     "    model.eval()\n",
391 |     "    print(\"model initialized\")"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "markdown",
396 |    "metadata": {},
397 |    "source": [
398 |     "Loading data from above processed csv ( extracted features(byte file) + 116 features(asm) ) "
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 14,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "path_tr='final_hybrid_train.csv'\n",
408 |     "\n",
409 |     "path_futr='final_hybrid_full_train.csv'\n",
410 |     "\n",
411 |     "path_te='final_hybrid_test.csv'\n",
412 |     "\n",
413 |     "path_val='final_hybrid_valid.csv'\n",
414 |     "\n",
415 |     "train = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n",
416 |     "\n",
417 |     "val = np.genfromtxt(path_val, delimiter=\",\", dtype=str)\n",
418 |     "\n",
419 |     "test = np.genfromtxt(path_te, delimiter=\",\", dtype=str)\n",
420 |     "\n",
421 |     "full_train = np.genfromtxt(path_futr, delimiter=\",\", dtype=str)"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "markdown",
426 |    "metadata": {},
427 |    "source": [
428 |     "Following piece of code extracts the feature from test data of byte file using fine tunned CNN and concatenate them to previous extracted and selected features to generate hybrid data"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": 15,
434 |    "metadata": {},
435 |    "outputs": [],
436 |    "source": [
437 |     "data_to_write_test=[]\n",
438 |     "\n",
439 |     "for image,name,label in test_data_loader:\n",
440 |     "    image=image.to(device)\n",
441 |     "    output=model(image)\n",
442 |     "    output=F.softmax(output)\n",
443 |     "    output = output.reshape(output.size(0), -1)\n",
444 |     "    output=output.to('cpu')\n",
445 |     "    output=output.detach().numpy()\n",
446 |     "    output=output.tolist()\n",
447 |     "    output=output[0]\n",
448 |     "    for i in test:\n",
449 |     "        if i[-2]==name[0]:\n",
450 |     "            print(i[-2],name[0])\n",
451 |     "            for j in i:\n",
452 |     "                output.append(j)\n",
453 |     "            \n",
454 |     "    data_to_write_test.append(output)\n",
455 |     "\n",
456 |     "myFile = open('final_hybrid_test.csv', 'w', newline='')  \n",
457 |     "\n",
458 |     "with myFile:  \n",
459 |     "    writer = csv.writer(myFile)\n",
460 |     "    writer.writerows(data_to_write_test)\n"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "markdown",
465 |    "metadata": {},
466 |    "source": [
467 |     "Following piece of code extracts the feature from train data of byte file using fine tunned CNN and concatenate them to previous extracted and selected features to generate hybrid data"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "code",
472 |    "execution_count": 16,
473 |    "metadata": {},
474 |    "outputs": [],
475 |    "source": [
476 |     "data_to_write_train=[]\n",
477 |     "\n",
478 |     "for image,name,label in train_data_loader:\n",
479 |     "    image=image.to(device)\n",
480 |     "    output=model(image)\n",
481 |     "    output=F.softmax(output)\n",
482 |     "    \n",
483 |     "    output = output.reshape(output.size(0), -1)\n",
484 |     "    \n",
485 |     "    output=output.to('cpu')\n",
486 |     "    output=output.detach().numpy()\n",
487 |     "    output=output.tolist()\n",
488 |     "    output=output[0]\n",
489 |     "    for i in train:\n",
490 |     "        if i[-2]==name[0]:\n",
491 |     "            print(i[-2],name[0])\n",
492 |     "            for j in i:\n",
493 |     "                output.append(j)\n",
494 |     "            \n",
495 |     "    data_to_write_train.append(output)\n",
496 |     "\n",
497 |     "myFile = open('final_hybrid_train.csv', 'w', newline='')  \n",
498 |     "\n",
499 |     "with myFile:  \n",
500 |     "    writer = csv.writer(myFile)\n",
501 |     "    writer.writerows(data_to_write_train)"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "markdown",
506 |    "metadata": {},
507 |    "source": [
508 |     "Following piece of code extracts the feature from the validation data of byte file using fine tunned CNN and concatenate them to previous extracted and selected features to generate hybrid data "
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": 17,
514 |    "metadata": {},
515 |    "outputs": [],
516 |    "source": [
517 |     "data_to_write_val=[]\n",
518 |     "\n",
519 |     "for image,name,label in valid_data_loader:\n",
520 |     "    image=image.to(device)\n",
521 |     "    output=model(image)\n",
522 |     "    output=F.softmax(output)\n",
523 |     "    \n",
524 |     "    output = output.reshape(output.size(0), -1)\n",
525 |     "    \n",
526 |     "    output=output.to('cpu')\n",
527 |     "    output=output.detach().numpy()\n",
528 |     "    output=output.tolist()\n",
529 |     "    output=output[0]\n",
530 |     "    for i in val:\n",
531 |     "        if i[-2]==name[0]:\n",
532 |     "            print(i[-2],name[0])\n",
533 |     "            for j in i:\n",
534 |     "                output.append(j)\n",
535 |     "            \n",
536 |     "    data_to_write_val.append(output)\n",
537 |     "myFile = open('final_hybrid_valid.csv', 'w', newline='')  \n",
538 |     "\n",
539 |     "with myFile:  \n",
540 |     "    writer = csv.writer(myFile)\n",
541 |     "    writer.writerows(data_to_write_val)"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "markdown",
546 |    "metadata": {},
547 |    "source": [
548 |     "Following piece of code extracts the feature from overall data of byte file using fine tunned CNN and concatenate them to previous extracted and selected features to generate hybrid data "
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "code",
553 |    "execution_count": 18,
554 |    "metadata": {},
555 |    "outputs": [],
556 |    "source": [
557 |     "data_to_write_full_train=[]\n",
558 |     "\n",
559 |     "for image,name,label in fulL_train_data_loader:\n",
560 |     "    image=image.to(device)\n",
561 |     "    output=model(image)\n",
562 |     "    output=F.softmax(output)\n",
563 |     "    \n",
564 |     "    output = output.reshape(output.size(0), -1)\n",
565 |     "    \n",
566 |     "    output=output.to('cpu')\n",
567 |     "    output=output.detach().numpy()\n",
568 |     "    output=output.tolist()\n",
569 |     "    output=output[0]\n",
570 |     "    for i in full_train:\n",
571 |     "        if i[-2]==name[0]:\n",
572 |     "            print(i[-2],name[0])\n",
573 |     "            for j in i:\n",
574 |     "                output.append(j)\n",
575 |     "\n",
576 |     "    \n",
577 |     "    data_to_write_full_train.append(output)\n",
578 |     "    \n",
579 |     "myFile = open('final_hybrid_full_train.csv', 'w', newline='')  \n",
580 |     "\n",
581 |     "with myFile:  \n",
582 |     "    writer = csv.writer(myFile)\n",
583 |     "    writer.writerows(data_to_write_full_train)"
584 |    ]
585 |   }
586 |  ],
587 |  "metadata": {
588 |   "kernelspec": {
589 |    "display_name": "Python 3",
590 |    "language": "python",
591 |    "name": "python3"
592 |   },
593 |   "language_info": {
594 |    "codemirror_mode": {
595 |     "name": "ipython",
596 |     "version": 3
597 |    },
598 |    "file_extension": ".py",
599 |    "mimetype": "text/x-python",
600 |    "name": "python",
601 |    "nbconvert_exporter": "python",
602 |    "pygments_lexer": "ipython3",
603 |    "version": "3.7.2"
604 |   }
605 |  },
606 |  "nbformat": 4,
607 |  "nbformat_minor": 2
608 | }
609 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/Min-max normalization (hybrid dataset).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Min-max normalization (hybrid dataset)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Importing libraries and initializing variables"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import numpy as np\n",
 24 |     "import csv\n",
 25 |     "path_tr='final_hybrid_train.csv'\n",
 26 |     "path_futr='final_hybrid_full_train.csv'\n",
 27 |     "path_te='final_hybrid_test.csv'\n",
 28 |     "path_val='final_hybrid_valid.csv'"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "In this section of code, train data is normalized with respect to its max and min values against each feature"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "new_train_data = np.genfromtxt(path_tr, delimiter=\",\", dtype=str)\n",
 45 |     "tra_data= new_train_data[:,0:-2]\n",
 46 |     "tra_lab=new_train_data[:,-1]\n",
 47 |     "tra_name=new_train_data[:,-2]\n",
 48 |     "tra_lab=tra_lab.astype(np.int)\n",
 49 |     "tra_data=tra_data.astype(np.float)\n",
 50 |     "tra_min=np.min(tra_data,axis=0)\n",
 51 |     "tra_max=np.max(tra_data,axis=0)\n",
 52 |     "tra_dom=np.subtract(tra_max,tra_min)\n",
 53 |     "tra_dom=np.where(tra_dom == 0, 1, tra_dom)\n",
 54 |     "img = np.subtract(tra_data,tra_min)\n",
 55 |     "img = np.divide(img, tra_dom)\n",
 56 |     "img= img.tolist()\n",
 57 |     "\n",
 58 |     "for i in range(len(tra_lab)):\n",
 59 |     "    img[i].append(tra_name[i])\n",
 60 |     "    img[i].append(tra_lab[i])\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "myFile = open('n_final_hybrid_train.csv', 'w', newline='')  \n",
 64 |     "\n",
 65 |     "with myFile:  \n",
 66 |     "    writer = csv.writer(myFile)\n",
 67 |     "    writer.writerows(img)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "Validation data is normalized with respect to train data max and min values against each feature"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 3,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "new_valid_data = np.genfromtxt(path_val, delimiter=\",\", dtype=str)\n",
 84 |     "valid_data= new_valid_data[:,0:-2]\n",
 85 |     "valid_name=new_valid_data[:,-2]\n",
 86 |     "valid_lab=new_valid_data[:,-1]\n",
 87 |     "valid_lab=valid_lab.astype(np.int)\n",
 88 |     "valid_data=valid_data.astype(np.float)\n",
 89 |     "img = np.subtract(valid_data,tra_min)\n",
 90 |     "img = np.divide(img, tra_dom)\n",
 91 |     "img= img.tolist()\n",
 92 |     "\n",
 93 |     "for i in range(len(valid_lab)):\n",
 94 |     "    img[i].append(valid_name[i])\n",
 95 |     "    img[i].append(valid_lab[i])\n",
 96 |     "\n",
 97 |     "\n",
 98 |     "myFile = open('n_final_hybrid_valid.csv', 'w', newline='')  \n",
 99 |     "\n",
100 |     "with myFile:  \n",
101 |     "    writer = csv.writer(myFile)\n",
102 |     "    writer.writerows(img)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "In this section of code, trainfull data is normalized with respect to its max and min values against each feature"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 4,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "new_x_data = np.genfromtxt(path_futr, delimiter=\",\", dtype=str)\n",
119 |     "x_data= new_x_data[:,0:-2]\n",
120 |     "x_name=new_x_data[:,-2]\n",
121 |     "x_lab=new_x_data[:,-1]\n",
122 |     "x_lab=x_lab.astype(np.int)\n",
123 |     "x_data=x_data.astype(np.float)\n",
124 |     "x_min=np.min(x_data,axis=0)\n",
125 |     "x_max=np.max(x_data,axis=0)\n",
126 |     "x_dom=np.subtract(x_max,x_min)\n",
127 |     "x_dom=np.where(x_dom == 0, 1, x_dom)\n",
128 |     "img = np.subtract(x_data,x_min)\n",
129 |     "img = np.divide(img, x_dom)\n",
130 |     "img= img.tolist()\n",
131 |     "\n",
132 |     "for i in range(len(x_lab)):\n",
133 |     "    img[i].append(x_name[i])\n",
134 |     "    img[i].append(x_lab[i])\n",
135 |     "\n",
136 |     "\n",
137 |     "myFile = open('n_final_hybrid_full_train.csv', 'w', newline='')  \n",
138 |     "\n",
139 |     "with myFile:  \n",
140 |     "    writer = csv.writer(myFile)\n",
141 |     "    writer.writerows(img)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Test data is normalized with respect to trainfull data max and min values against each feature"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 12,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "new_test_data = np.genfromtxt(path_te, delimiter=\",\", dtype=str)\n",
158 |     "test_data= new_test_data[:,0:-2]\n",
159 |     "test_lab=new_test_data[:,-1]\n",
160 |     "test_name=new_test_data[:,-2]\n",
161 |     "\n",
162 |     "test_lab=test_lab.astype(np.int)\n",
163 |     "test_data=test_data.astype(np.float)\n",
164 |     "img = np.subtract(test_data,x_min)\n",
165 |     "img = np.divide(img, x_dom)\n",
166 |     "img= img.tolist()\n",
167 |     "\n",
168 |     "for i in range(len(test_lab)):\n",
169 |     "    img[i].append(test_name[i])\n",
170 |     "    img[i].append(test_lab[i])\n",
171 |     "\n",
172 |     "\n",
173 |     "myFile = open('n_final_hybrid_test.csv', 'w', newline='')  \n",
174 |     "\n",
175 |     "with myFile:  \n",
176 |     "    writer = csv.writer(myFile)\n",
177 |     "    writer.writerows(img)"
178 |    ]
179 |   }
180 |  ],
181 |  "metadata": {
182 |   "kernelspec": {
183 |    "display_name": "Python 3",
184 |    "language": "python",
185 |    "name": "python3"
186 |   },
187 |   "language_info": {
188 |    "codemirror_mode": {
189 |     "name": "ipython",
190 |     "version": 3
191 |    },
192 |    "file_extension": ".py",
193 |    "mimetype": "text/x-python",
194 |    "name": "python",
195 |    "nbconvert_exporter": "python",
196 |    "pygments_lexer": "ipython3",
197 |    "version": "3.7.2"
198 |   }
199 |  },
200 |  "nbformat": 4,
201 |  "nbformat_minor": 2
202 | }
203 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/Read me:
--------------------------------------------------------------------------------
 1 | The sequence of the tutuorials is given below:
 2 | 
 3 | 1. Creating hybrid dataset
 4 | 
 5 | 2. Min-max normalization(hybrid dataset)
 6 | 
 7 | 3. ANN-Results
 8 | 
 9 | The data use in these tutorial can be found on the Hybrid(Final) folder of following drive link:
10 | 
11 | https://drive.google.com/drive/folders/1s7EC4s_-hP9q5vEhs-3vAubspcZbBADK?usp=sharing
12 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/ann_hybrid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Tue Apr 23 14:04:01 2019
 5 | 
 6 | @author: PR-LAB
 7 | """
 8 | 
 9 | 
10 | import torch.nn as nn
11 | 
12 | class ANN(nn.Module):
13 |     def __init__(self):
14 |         super(ANN, self).__init__()
15 |         self.feature = nn.Sequential(nn.Linear(134, 100),nn.LeakyReLU(True))
16 |         
17 |         self.feature1 = nn.Sequential(nn.Linear(100, 9),nn.LeakyReLU(True))
18 |         
19 |         
20 |     def forward(self, x):
21 |         x = self.feature(x)
22 | 
23 |         x = self.feature1(x)
24 |         return x
25 | 
26 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/dataloader_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Mon Mar  4 09:15:29 2019
 5 | 
 6 | @author: PR-LAB
 7 | """
 8 | 
 9 | from torch.utils.data.dataset import Dataset
10 | from torchvision import transforms
11 | import pandas as pd
12 | import numpy as np
13 | import torch
14 | class CustomDatasetFromImages(Dataset):
15 |     
16 |     def __init__(self, csv_path):
17 |         # Transforms
18 |         self.to_tensor = transforms.ToTensor()
19 |         # Read the csv file
20 |         self.data_info = pd.read_csv(csv_path, header=None)
21 |         
22 |         # First column contains the image paths
23 |         self.image_arr = np.asarray(self.data_info.iloc[:, 0:-2])
24 |         # Second column is the labels
25 |         self.label_arr = np.asarray(self.data_info.iloc[:, -1])-1
26 |         
27 |         
28 |         # Third column is for an operation indicator
29 |         self.operation_arr = np.asarray(self.data_info.iloc[:, -2])
30 |         # Calculate len
31 |         self.data_len = len(self.data_info.index)
32 |         
33 |     def __getitem__(self, index):
34 |         # Get image name from the pandas df
35 |         single_image_name = self.image_arr[index]
36 | 
37 |         single_image_name = np.array(single_image_name)
38 |         single_image_name = np.resize(single_image_name,(32,32))
39 |         img_as_tensor = torch.Tensor(single_image_name)
40 |         img_as_tensor = torch.unsqueeze(img_as_tensor,0)
41 |         single_image_label = self.label_arr[index]
42 |         name=self.operation_arr[index]
43 |         return (img_as_tensor,name,single_image_label)
44 |         
45 | 
46 |     def __len__(self):
47 |         return self.data_len    
48 |     
49 | 
50 | 


--------------------------------------------------------------------------------
/Hybrid (Final)/final_hybrid_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Tue Apr 23 13:54:41 2019
 5 | 
 6 | @author: PR-LAB
 7 | """
 8 | 
 9 | #!/usr/bin/env python3
10 | # -*- coding: utf-8 -*-
11 | """
12 | Created on Mon Mar  4 09:15:29 2019
13 | 
14 | @author: aqsas
15 | """
16 | 
17 | from torch.utils.data.dataset import Dataset
18 | from torchvision import transforms
19 | import pandas as pd
20 | import numpy as np
21 | import torch
22 | from PIL import Image
23 | class CSV_READER(Dataset):
24 |     def __init__(self, csv_path):
25 |         # Transforms
26 |         self.to_tensor = transforms.ToTensor()
27 |         # Read the csv file
28 |         self.data_info = pd.read_csv(csv_path, header=None)
29 |         
30 |         # First column contains the image paths
31 |         self.image_arr = np.asarray(self.data_info.iloc[:, 0:-2])
32 |         # Second column is the labels
33 |         self.label_arr = np.asarray(self.data_info.iloc[:, -1])-1
34 |         
35 |         
36 |         # Third column is for an operation indicator
37 |         self.operation_arr = np.asarray(self.data_info.iloc[:, -2])
38 |         # Calculate len
39 |         self.data_len = len(self.data_info.index)
40 |     def __getitem__(self, index):
41 |         # Get image name from the pandas df
42 |         single_image_name = self.image_arr[index]
43 |         #print(single_image_name)
44 |     
45 |         #print(len(single_image_name))
46 |         # Open image
47 |         '''
48 |         img_as_img = Image.open(single_image_name)
49 | 
50 |         # Check if there is an operation
51 |         some_operation = self.operation_arr[index]
52 |         # If there is an operation
53 |         if some_operation:
54 |             # Do some operation on image
55 |             # ...
56 |             # ...
57 |             pass
58 |         # Transform image to tensor
59 |         '''
60 |         #single_image_name =Image.fromarray(single_image_name)
61 |         #single_image_name = self.transforms(single_image_name)
62 |         #img_as_tensor = self.to_tensor(np.array(single_image_name))
63 |     
64 | 
65 |         single_image_name = np.array(single_image_name)
66 |         img_as_tensor = torch.Tensor(single_image_name)
67 |         img_as_tensor = torch.unsqueeze(img_as_tensor,0)
68 |         single_image_label = self.label_arr[index]
69 | #        name=self.operation_arr[index]
70 |         return (img_as_tensor,single_image_label)
71 |         
72 | 
73 |     def __len__(self):
74 |         return self.data_len
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Malware Detection Using Machine Learning
 2 | This repository contains the source code for detecting different type of malwares using Deep learning based Feature Extraction and Wraper based Feature Selection Technique. A research paper describing how it works is availible at "https://arxiv.org/abs/1910.10958"
 3 | 
 4 | Two major approaches we used for malware classification:
 5 | 1- Image representation of byte file
 6 | Independent of the platform
 7 | It requires No knowledge of domain like assembly instructions
 8 | 2- Hybrid feature space using both ASM and byte file
 9 | This approach is platform dependent but gives a better performance that using byte file. Requires huge resources and processing time.
10 | 
11 | The data used in these tutorial can be found on the Hybrid(Final) folder of following drive link:
12 | 
13 | https://drive.google.com/drive/folders/1s7EC4s_-hP9q5vEhs-3vAubspcZbBADK?usp=sharing
14 | 
15 | After downloading the required dataset, following is the sequence of files in the hybrid folder whose execution will lead to results.
16 | 
17 | 1. "Creating hybrid dataset"
18 | 
19 | 2. "Min-max normalization(hybrid dataset)"
20 | 
21 | 3. "ANN-Results"
22 | 
23 | The project was done under the guidance of Dr. Asifullah Khan, DCIS, PIEAS.
24 | 


--------------------------------------------------------------------------------