├── .ipynb_checkpoints └── DNA Classification Code-checkpoint.ipynb ├── DNA Classification Code.ipynb ├── README.md └── promoters.data /.ipynb_checkpoints/DNA Classification Code-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# -------------------------------- DNA Classification Project ---------------------------------------" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## About :\n", 15 | "### In this project, we will explore the world of bioinformatics by using Markov models, K-nearest neighbor (KNN) algorithms, support vector machines, and other common classifiers to classify short E. coli DNA sequences. This project will use a dataset from the UCI Machine Learning Repository that has 106 DNA sequences, with 57 sequential nucleotides (“base-pairs”) each.\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "\n", 21 | "#### It includes :\n", 22 | "" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Hide warnings\n", 37 | "import warnings\n", 38 | "warnings.simplefilter('ignore')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Step 1: Importing the Dataset\n", 46 | "\n", 47 | "The following code cells will import necessary libraries and import the dataset from the UCI repository as a Pandas DataFrame." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "#import and change module name\n", 57 | "import numpy as np\n", 58 | "import matplotlib.pyplot as plt\n", 59 | "import pandas as pd\n", 60 | "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'\n", 61 | "names = ['Class', 
'id', 'Sequence']\n", 62 | "data = pd.read_csv(url, names = names)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "Index(['Class', 'id', 'Sequence'], dtype='object')" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "data.columns" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
ClassidSequence
0+S10\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1+AMPC\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2+AROH\\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...
3+DEOP2\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4+LEU1_TRNA\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " Class id Sequence\n", 153 | "0 + S10 \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...\n", 154 | "1 + AMPC \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...\n", 155 | "2 + AROH \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...\n", 156 | "3 + DEOP2 \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...\n", 157 | "4 + LEU1_TRNA \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc..." 158 | ] 159 | }, 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "data.head()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "(106, 3)" 178 | ] 179 | }, 180 | "execution_count": 5, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "data.shape" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 6, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "Class object\n", 198 | "id object\n", 199 | "Sequence object\n", 200 | "dtype: object" 201 | ] 202 | }, 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "data.dtypes" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Step 2: Preprocessing the Dataset\n", 217 | "\n", 218 | "The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms." 
219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "0 +\n", 230 | "1 +\n", 231 | "2 +\n", 232 | "3 +\n", 233 | "4 +\n", 234 | "Name: Class, dtype: object" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# Build our dataset using custom pandas dataframe\n", 244 | "clases = data.loc[:,'Class']\n", 245 | "clases.head()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "['\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt',\n", 257 | " '\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa',\n", 258 | " '\\t\\tgtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg',\n", 259 | " '\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc',\n", 260 | " '\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag',\n", 261 | " '\\taggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt',\n", 262 | " '\\t\\tcagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat',\n", 263 | " '\\t\\ttttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca',\n", 264 | " '\\t\\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt',\n", 265 | " '\\tttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", 266 | " '\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg',\n", 267 | " '\\tcctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac',\n", 268 | " '\\tgatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga',\n", 269 | " '\\tctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca',\n", 270 | " '\\ttttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", 271 | " '\\taagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc',\n", 272 | " 
'\\tatgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca',\n", 273 | " '\\t\\taaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc',\n", 274 | " '\\t\\ttctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg',\n", 275 | " '\\t\\tgcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt',\n", 276 | " '\\t\\tgacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag',\n", 277 | " '\\t\\taaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta',\n", 278 | " '\\t\\ttctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca',\n", 279 | " '\\taccggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc',\n", 280 | " '\\t\\taaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg',\n", 281 | " '\\t\\tttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc',\n", 282 | " '\\t\\tcatcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt',\n", 283 | " '\\ttccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg',\n", 284 | " '\\tacagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg',\n", 285 | " '\\t\\ttgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt',\n", 286 | " '\\tctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg',\n", 287 | " '\\tattacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt',\n", 288 | " '\\tatgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg',\n", 289 | " '\\t\\ttaaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt',\n", 290 | " '\\t\\tatgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat',\n", 291 | " '\\tccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc',\n", 292 | " '\\t\\ttcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata',\n", 293 | " '\\t\\tccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat',\n", 294 | " '\\t\\tttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt',\n", 295 | " '\\t\\ttgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta',\n", 296 | " '\\tgatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt',\n", 297 | " 
'\\t\\taacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc',\n", 298 | " '\\tttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg',\n", 299 | " '\\t\\tgccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat',\n", 300 | " '\\tcagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt',\n", 301 | " '\\tcactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat',\n", 302 | " '\\t\\tatataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt',\n", 303 | " '\\t\\tcaaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat',\n", 304 | " '\\tggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct',\n", 305 | " '\\ttaggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg',\n", 306 | " '\\t\\tccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat',\n", 307 | " '\\t\\ttggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag',\n", 308 | " '\\ttcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat',\n", 309 | " '\\t\\tatatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta',\n", 310 | " '\\t\\tcgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc',\n", 311 | " '\\t\\tcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg',\n", 312 | " '\\t\\tttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat',\n", 313 | " '\\t\\tcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag',\n", 314 | " '\\t\\tgccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt',\n", 315 | " '\\t\\ttggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa',\n", 316 | " '\\t\\tgaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta',\n", 317 | " '\\t\\tcgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac',\n", 318 | " '\\t\\tcgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg',\n", 319 | " '\\t\\tctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg',\n", 320 | " '\\t\\tatagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt',\n", 321 | " '\\t\\taactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg',\n", 322 | " 
'\\t\\tttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt',\n", 323 | " '\\t\\ttattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga',\n", 324 | " '\\t\\taacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg',\n", 325 | " '\\t\\taagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc',\n", 326 | " '\\t\\tgaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact',\n", 327 | " '\\t\\tttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct',\n", 328 | " '\\t\\ttattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac',\n", 329 | " '\\t\\ttgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga',\n", 330 | " '\\t\\tcatgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc',\n", 331 | " '\\t\\tttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca',\n", 332 | " '\\t\\tcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt',\n", 333 | " '\\t\\taggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg',\n", 334 | " '\\t\\ttctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga',\n", 335 | " '\\t\\ttgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg',\n", 336 | " '\\t\\tctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa',\n", 337 | " '\\t\\tgcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt',\n", 338 | " '\\t\\tatccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta',\n", 339 | " '\\t\\ttggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg',\n", 340 | " '\\t\\ttctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac',\n", 341 | " '\\t\\ttattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt',\n", 342 | " '\\t\\ttagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg',\n", 343 | " '\\t\\tcagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg',\n", 344 | " '\\t\\tttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa',\n", 345 | " '\\t\\tacgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga',\n", 346 | " '\\t\\tggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta',\n", 347 | " 
'\\t\\taaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga',\n", 348 | " '\\t\\tagacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga',\n", 349 | " '\\t\\ttgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca',\n", 350 | " '\\t\\ttgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc',\n", 351 | " '\\t\\taggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag',\n", 352 | " '\\t\\tccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg',\n", 353 | " '\\t\\tcgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt',\n", 354 | " '\\t\\ttatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt',\n", 355 | " '\\t\\tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga',\n", 356 | " '\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg',\n", 357 | " '\\t\\tcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat',\n", 358 | " '\\t\\tgtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg',\n", 359 | " '\\t\\tcgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc',\n", 360 | " '\\t\\tctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac',\n", 361 | " '\\t\\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact']" 362 | ] 363 | }, 364 | "execution_count": 8, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "# generate list of DNA sequence\n", 371 | "sequence = list(data.loc[:, 'Sequence'])\n", 372 | "sequence" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 9, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "['t',\n", 384 | " 'a',\n", 385 | " 'c',\n", 386 | " 't',\n", 387 | " 'a',\n", 388 | " 'g',\n", 389 | " 'c',\n", 390 | " 'a',\n", 391 | " 'a',\n", 392 | " 't',\n", 393 | " 'a',\n", 394 | " 'c',\n", 395 | " 'g',\n", 396 | " 'c',\n", 397 | " 't',\n", 398 | " 't',\n", 399 | " 'g',\n", 400 | " 'c',\n", 401 | " 'g',\n", 402 | " 't',\n", 403 | " 't',\n", 404 | " 'c',\n", 405 | " 'g',\n", 406 | " 
'g',\n", 407 | " 't',\n", 408 | " 'g',\n", 409 | " 'g',\n", 410 | " 't',\n", 411 | " 't',\n", 412 | " 'a',\n", 413 | " 'a',\n", 414 | " 'g',\n", 415 | " 't',\n", 416 | " 'a',\n", 417 | " 't',\n", 418 | " 'g',\n", 419 | " 't',\n", 420 | " 'a',\n", 421 | " 't',\n", 422 | " 'a',\n", 423 | " 'a',\n", 424 | " 't',\n", 425 | " 'g',\n", 426 | " 'c',\n", 427 | " 'g',\n", 428 | " 'c',\n", 429 | " 'g',\n", 430 | " 'g',\n", 431 | " 'g',\n", 432 | " 'c',\n", 433 | " 't',\n", 434 | " 't',\n", 435 | " 'g',\n", 436 | " 't',\n", 437 | " 'c',\n", 438 | " 'g',\n", 439 | " 't',\n", 440 | " '+']" 441 | ] 442 | }, 443 | "execution_count": 9, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "#Remove tab from each sequence\n", 450 | "dic = {}\n", 451 | "for i, seq in enumerate(sequence):\n", 452 | " nucleotides = list(seq)\n", 453 | " nucleotides = [char for char in nucleotides if char != '\\t']\n", 454 | " #append class assignment\n", 455 | " nucleotides.append(clases[i])\n", 456 | " \n", 457 | " dic[i] = nucleotides\n", 458 | "dic[0] " 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 10, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "data": { 468 | "text/html": [ 469 | "
\n", 470 | "\n", 483 | "\n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | "
0123456789...96979899100101102103104105
0ttgatactct...cctagcgcct
1agtacgatgt...cgagactgta
2ccatgggtat...gctagtacca
3ttctaggcct...atggactggc
4aatgtggtta...gaaggatata
\n", 633 | "

5 rows × 106 columns

\n", 634 | "
" 635 | ], 636 | "text/plain": [ 637 | " 0 1 2 3 4 5 6 7 8 9 ... 96 97 98 99 100 101 102 \\\n", 638 | "0 t t g a t a c t c t ... c c t a g c g \n", 639 | "1 a g t a c g a t g t ... c g a g a c t \n", 640 | "2 c c a t g g g t a t ... g c t a g t a \n", 641 | "3 t t c t a g g c c t ... a t g g a c t \n", 642 | "4 a a t g t g g t t a ... g a a g g a t \n", 643 | "\n", 644 | " 103 104 105 \n", 645 | "0 c c t \n", 646 | "1 g t a \n", 647 | "2 c c a \n", 648 | "3 g g c \n", 649 | "4 a t a \n", 650 | "\n", 651 | "[5 rows x 106 columns]" 652 | ] 653 | }, 654 | "execution_count": 10, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "# Convert Dict object into dataframe\n", 661 | "df = pd.DataFrame(dic)\n", 662 | "df.head()\n" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 11, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/html": [ 673 | "
\n", 674 | "\n", 687 | "\n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | "
0123456789...48495051525354555657
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", 837 | "

5 rows × 58 columns

\n", 838 | "
" 839 | ], 840 | "text/plain": [ 841 | " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57\n", 842 | "0 t a c t a g c a a t ... g c t t g t c g t +\n", 843 | "1 t g c t a t c c t g ... c a t c g c c a a +\n", 844 | "2 g t a c t a g a g a ... c a c c c g g c g +\n", 845 | "3 a a t t g t g a t g ... a a c a a a c t c +\n", 846 | "4 t c g a t a a t t a ... c c g t g g t a g +\n", 847 | "\n", 848 | "[5 rows x 58 columns]" 849 | ] 850 | }, 851 | "execution_count": 11, 852 | "metadata": {}, 853 | "output_type": "execute_result" 854 | } 855 | ], 856 | "source": [ 857 | "# transpose dataframe into correct format\n", 858 | "df = df.transpose()\n", 859 | "df.head()" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 12, 865 | "metadata": {}, 866 | "outputs": [ 867 | { 868 | "data": { 869 | "text/plain": [ 870 | "RangeIndex(start=0, stop=58, step=1)" 871 | ] 872 | }, 873 | "execution_count": 12, 874 | "metadata": {}, 875 | "output_type": "execute_result" 876 | } 877 | ], 878 | "source": [ 879 | "df.columns" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": 13, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [ 888 | "# Rename\n", 889 | "df.rename(columns = {57:'Class'}, inplace = True)" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": 14, 895 | "metadata": {}, 896 | "outputs": [ 897 | { 898 | "data": { 899 | "text/plain": [ 900 | "Index([ 0, 1, 2, 3, 4, 5, 6, 7,\n", 901 | " 8, 9, 10, 11, 12, 13, 14, 15,\n", 902 | " 16, 17, 18, 19, 20, 21, 22, 23,\n", 903 | " 24, 25, 26, 27, 28, 29, 30, 31,\n", 904 | " 32, 33, 34, 35, 36, 37, 38, 39,\n", 905 | " 40, 41, 42, 43, 44, 45, 46, 47,\n", 906 | " 48, 49, 50, 51, 52, 53, 54, 55,\n", 907 | " 56, 'Class'],\n", 908 | " dtype='object')" 909 | ] 910 | }, 911 | "execution_count": 14, 912 | "metadata": {}, 913 | "output_type": "execute_result" 914 | } 915 | ], 916 | "source": [ 917 | "df.columns" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 
922 | "execution_count": 15, 923 | "metadata": {}, 924 | "outputs": [ 925 | { 926 | "data": { 927 | "text/html": [ 928 | "
\n", 929 | "\n", 942 | "\n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " 
\n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | "
0123456789...484950515253545556Class
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", 1092 | "

5 rows × 58 columns

\n", 1093 | "
" 1094 | ], 1095 | "text/plain": [ 1096 | " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class\n", 1097 | "0 t a c t a g c a a t ... g c t t g t c g t +\n", 1098 | "1 t g c t a t c c t g ... c a t c g c c a a +\n", 1099 | "2 g t a c t a g a g a ... c a c c c g g c g +\n", 1100 | "3 a a t t g t g a t g ... a a c a a a c t c +\n", 1101 | "4 t c g a t a a t t a ... c c g t g g t a g +\n", 1102 | "\n", 1103 | "[5 rows x 58 columns]" 1104 | ] 1105 | }, 1106 | "execution_count": 15, 1107 | "metadata": {}, 1108 | "output_type": "execute_result" 1109 | } 1110 | ], 1111 | "source": [ 1112 | "df.head()" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 16, 1118 | "metadata": {}, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/html": [ 1123 | "
\n", 1124 | "\n", 1137 | "\n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " 
\n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...55_a55_c55_g55_t56_a56_c56_g56_tClass_+Class_-
00001100001...0010000110
10001001001...1000100010
20010000110...0100001010
31000100000...0001010010
40001010000...1000001010
\n", 1287 | "

5 rows × 230 columns

\n", 1288 | "
" 1289 | ], 1290 | "text/plain": [ 1291 | " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 55_a 55_c 55_g \\\n", 1292 | "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 1 \n", 1293 | "1 0 0 0 1 0 0 1 0 0 1 ... 1 0 0 \n", 1294 | "2 0 0 1 0 0 0 0 1 1 0 ... 0 1 0 \n", 1295 | "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", 1296 | "4 0 0 0 1 0 1 0 0 0 0 ... 1 0 0 \n", 1297 | "\n", 1298 | " 55_t 56_a 56_c 56_g 56_t Class_+ Class_- \n", 1299 | "0 0 0 0 0 1 1 0 \n", 1300 | "1 0 1 0 0 0 1 0 \n", 1301 | "2 0 0 0 1 0 1 0 \n", 1302 | "3 1 0 1 0 0 1 0 \n", 1303 | "4 0 0 0 1 0 1 0 \n", 1304 | "\n", 1305 | "[5 rows x 230 columns]" 1306 | ] 1307 | }, 1308 | "execution_count": 16, 1309 | "metadata": {}, 1310 | "output_type": "execute_result" 1311 | } 1312 | ], 1313 | "source": [ 1314 | "#Encoding\n", 1315 | "numerical_df = pd.get_dummies(df)\n", 1316 | "numerical_df.head()" 1317 | ] 1318 | }, 1319 | { 1320 | "cell_type": "code", 1321 | "execution_count": 17, 1322 | "metadata": {}, 1323 | "outputs": [ 1324 | { 1325 | "data": { 1326 | "text/html": [ 1327 | "
\n", 1328 | "\n", 1341 | "\n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " 
\n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...54_t55_a55_c55_g55_t56_a56_c56_g56_tClass_+
00001100001...0001000011
10001001001...0100010001
20010000110...0010000101
31000100000...0000101001
40001010000...1100000101
\n", 1491 | "

5 rows × 229 columns

\n", 1492 | "
" 1493 | ], 1494 | "text/plain": [ 1495 | " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 54_t 55_a 55_c \\\n", 1496 | "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 0 \n", 1497 | "1 0 0 0 1 0 0 1 0 0 1 ... 0 1 0 \n", 1498 | "2 0 0 1 0 0 0 0 1 1 0 ... 0 0 1 \n", 1499 | "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", 1500 | "4 0 0 0 1 0 1 0 0 0 0 ... 1 1 0 \n", 1501 | "\n", 1502 | " 55_g 55_t 56_a 56_c 56_g 56_t Class_+ \n", 1503 | "0 1 0 0 0 0 1 1 \n", 1504 | "1 0 0 1 0 0 0 1 \n", 1505 | "2 0 0 0 0 1 0 1 \n", 1506 | "3 0 1 0 1 0 0 1 \n", 1507 | "4 0 0 0 0 1 0 1 \n", 1508 | "\n", 1509 | "[5 rows x 229 columns]" 1510 | ] 1511 | }, 1512 | "execution_count": 17, 1513 | "metadata": {}, 1514 | "output_type": "execute_result" 1515 | } 1516 | ], 1517 | "source": [ 1518 | "# Drop class_- or Class_+ either of one\n", 1519 | "numerical_df.drop('Class_-', axis = 1, inplace = True)\n", 1520 | "numerical_df.head()" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 18, 1526 | "metadata": {}, 1527 | "outputs": [], 1528 | "source": [ 1529 | "# rename Class_+ to Class\n", 1530 | "numerical_df.rename(columns = {'Class_+':'Class'}, inplace = True)" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "markdown", 1535 | "metadata": {}, 1536 | "source": [ 1537 | "## Step 3: Training and Testing the Classification Algorithms\n", 1538 | "\n", 1539 | "Now that we have preprocessed the data and built our training and testing datasets, we can start to deploy different classification algorithms. It's relatively easy to test multiple models; as a result, we will compare and contrast the performance of ten different algorithms." 
1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 19, 1545 | "metadata": {}, 1546 | "outputs": [], 1547 | "source": [ 1548 | "#Importing different classifier from sklearn\n", 1549 | "from sklearn.neighbors import KNeighborsClassifier\n", 1550 | "from sklearn.tree import DecisionTreeClassifier\n", 1551 | "from sklearn import svm\n", 1552 | "from sklearn.naive_bayes import GaussianNB\n", 1553 | "from sklearn.gaussian_process.kernels import RBF\n", 1554 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 1555 | "from sklearn.neural_network import MLPClassifier\n", 1556 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 1557 | "from sklearn.metrics import classification_report, accuracy_score" 1558 | ] 1559 | }, 1560 | { 1561 | "cell_type": "code", 1562 | "execution_count": 20, 1563 | "metadata": {}, 1564 | "outputs": [], 1565 | "source": [ 1566 | "from sklearn.model_selection import train_test_split\n", 1567 | "X = numerical_df.drop(['Class'], axis = 1).values\n", 1568 | "y = numerical_df['Class'].values\n", 1569 | "\n", 1570 | "#define a seed for reproducibility\n", 1571 | "seed = 1\n", 1572 | "\n", 1573 | "# Splitting data into training and testing data\n", 1574 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = seed)\n" 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "code", 1579 | "execution_count": 21, 1580 | "metadata": {}, 1581 | "outputs": [ 1582 | { 1583 | "name": "stdout", 1584 | "output_type": "stream", 1585 | "text": [ 1586 | "K Nearest Neighbors: 0.8232142857142858 (0.11390841738440759)\n", 1587 | "Gaussian Process: 0.8732142857142857 (0.05615780426255853)\n", 1588 | "Decision Tree: 0.7482142857142857 (0.19256757361550586)\n", 1589 | "Random Forest: 0.6428571428571429 (0.1326726830071908)\n", 1590 | "Neural Net: 0.8625 (0.11792476415070755)\n", 1591 | "AddaBoost: 0.9125 (0.1125)\n", 1592 | "Naive Bayes: 0.8375 (0.1375)\n", 1593 | "SVM 
Linear: 0.85 (0.10897247358851683)\n", 1594 | "SVM RBF: 0.7375 (0.11792476415070755)\n", 1595 | "SVM Sigmoid: 0.5696428571428571 (0.1592092225048921)\n" 1596 | ] 1597 | } 1598 | ], 1599 | "source": [ 1600 | "# Define scoring method\n", 1601 | "scoring = 'accuracy'\n", 1602 | "# Model building to train\n", 1603 | "names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AddaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']\n", 1604 | "Classifiers = [\n", 1605 | " KNeighborsClassifier(n_neighbors = 3),\n", 1606 | " GaussianProcessClassifier(1.0*RBF(1.0)),\n", 1607 | " DecisionTreeClassifier(max_depth = 5),\n", 1608 | " RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1 ),\n", 1609 | " MLPClassifier(alpha = 1),\n", 1610 | " AdaBoostClassifier(),\n", 1611 | " GaussianNB(),\n", 1612 | " svm.SVC(kernel = 'linear'),\n", 1613 | " svm.SVC(kernel = 'rbf'),\n", 1614 | " svm.SVC(kernel = 'sigmoid')\n", 1615 | " \n", 1616 | " ]\n", 1617 | "models = zip(names, Classifiers)\n", 1618 | "# import KFold\n", 1619 | "from sklearn.model_selection import KFold, cross_val_score\n", 1620 | "\n", 1621 | "names = []\n", 1622 | "result = []\n", 1623 | "for name, model in models:\n", 1624 | " kfold = KFold(n_splits = 10) # unshuffled folds; random_state is only accepted together with shuffle = True\n", 1625 | " cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = scoring)\n", 1626 | " result.append(cv_results)\n", 1627 | " names.append(name)\n", 1628 | " msg = \"{0}: {1} ({2})\".format(name, cv_results.mean(), cv_results.std())\n", 1629 | " print(msg)" 1630 | ] 1631 | }, 1632 | { 1633 | "cell_type": "markdown", 1634 | "metadata": {}, 1635 | "source": [ 1636 | "## Step 4 : Model Evaluation\n", 1637 | "\n", 1638 | "Now we will evaluate our classification algorithms using the accuracy score and a classification report." 
1639 | ] 1640 | }, 1641 | { 1642 | "cell_type": "code", 1643 | "execution_count": 22, 1644 | "metadata": {}, 1645 | "outputs": [ 1646 | { 1647 | "name": "stdout", 1648 | "output_type": "stream", 1649 | "text": [ 1650 | "K Nearest Neighbors\n", 1651 | "0.7777777777777778\n", 1652 | " precision recall f1-score support\n", 1653 | "\n", 1654 | " 0 1.00 0.65 0.79 17\n", 1655 | " 1 0.62 1.00 0.77 10\n", 1656 | "\n", 1657 | " accuracy 0.78 27\n", 1658 | " macro avg 0.81 0.82 0.78 27\n", 1659 | "weighted avg 0.86 0.78 0.78 27\n", 1660 | "\n", 1661 | "Gaussian Process\n", 1662 | "0.8888888888888888\n", 1663 | " precision recall f1-score support\n", 1664 | "\n", 1665 | " 0 1.00 0.82 0.90 17\n", 1666 | " 1 0.77 1.00 0.87 10\n", 1667 | "\n", 1668 | " accuracy 0.89 27\n", 1669 | " macro avg 0.88 0.91 0.89 27\n", 1670 | "weighted avg 0.91 0.89 0.89 27\n", 1671 | "\n", 1672 | "Decision Tree\n", 1673 | "0.7777777777777778\n", 1674 | " precision recall f1-score support\n", 1675 | "\n", 1676 | " 0 1.00 0.65 0.79 17\n", 1677 | " 1 0.62 1.00 0.77 10\n", 1678 | "\n", 1679 | " accuracy 0.78 27\n", 1680 | " macro avg 0.81 0.82 0.78 27\n", 1681 | "weighted avg 0.86 0.78 0.78 27\n", 1682 | "\n", 1683 | "Random Forest\n", 1684 | "0.5185185185185185\n", 1685 | " precision recall f1-score support\n", 1686 | "\n", 1687 | " 0 0.70 0.41 0.52 17\n", 1688 | " 1 0.41 0.70 0.52 10\n", 1689 | "\n", 1690 | " accuracy 0.52 27\n", 1691 | " macro avg 0.56 0.56 0.52 27\n", 1692 | "weighted avg 0.59 0.52 0.52 27\n", 1693 | "\n", 1694 | "Neural Net\n", 1695 | "0.9259259259259259\n", 1696 | " precision recall f1-score support\n", 1697 | "\n", 1698 | " 0 1.00 0.88 0.94 17\n", 1699 | " 1 0.83 1.00 0.91 10\n", 1700 | "\n", 1701 | " accuracy 0.93 27\n", 1702 | " macro avg 0.92 0.94 0.92 27\n", 1703 | "weighted avg 0.94 0.93 0.93 27\n", 1704 | "\n", 1705 | "AddaBoost\n", 1706 | "0.8518518518518519\n", 1707 | " precision recall f1-score support\n", 1708 | "\n", 1709 | " 0 1.00 0.76 0.87 17\n", 1710 | " 1 0.71 1.00 
0.83 10\n", 1711 | "\n", 1712 | " accuracy 0.85 27\n", 1713 | " macro avg 0.86 0.88 0.85 27\n", 1714 | "weighted avg 0.89 0.85 0.85 27\n", 1715 | "\n", 1716 | "Naive Bayes\n", 1717 | "0.9259259259259259\n", 1718 | " precision recall f1-score support\n", 1719 | "\n", 1720 | " 0 1.00 0.88 0.94 17\n", 1721 | " 1 0.83 1.00 0.91 10\n", 1722 | "\n", 1723 | " accuracy 0.93 27\n", 1724 | " macro avg 0.92 0.94 0.92 27\n", 1725 | "weighted avg 0.94 0.93 0.93 27\n", 1726 | "\n", 1727 | "SVM Linear\n", 1728 | "0.9629629629629629\n", 1729 | " precision recall f1-score support\n", 1730 | "\n", 1731 | " 0 1.00 0.94 0.97 17\n", 1732 | " 1 0.91 1.00 0.95 10\n", 1733 | "\n", 1734 | " accuracy 0.96 27\n", 1735 | " macro avg 0.95 0.97 0.96 27\n", 1736 | "weighted avg 0.97 0.96 0.96 27\n", 1737 | "\n", 1738 | "SVM RBF\n", 1739 | "0.7777777777777778\n", 1740 | " precision recall f1-score support\n", 1741 | "\n", 1742 | " 0 1.00 0.65 0.79 17\n", 1743 | " 1 0.62 1.00 0.77 10\n", 1744 | "\n", 1745 | " accuracy 0.78 27\n", 1746 | " macro avg 0.81 0.82 0.78 27\n", 1747 | "weighted avg 0.86 0.78 0.78 27\n", 1748 | "\n", 1749 | "SVM Sigmoid\n", 1750 | "0.4444444444444444\n", 1751 | " precision recall f1-score support\n", 1752 | "\n", 1753 | " 0 1.00 0.12 0.21 17\n", 1754 | " 1 0.40 1.00 0.57 10\n", 1755 | "\n", 1756 | " accuracy 0.44 27\n", 1757 | " macro avg 0.70 0.56 0.39 27\n", 1758 | "weighted avg 0.78 0.44 0.34 27\n", 1759 | "\n" 1760 | ] 1761 | } 1762 | ], 1763 | "source": [ 1764 | "#Test the algorithm on the test data set\n", 1765 | "models = zip(names, Classifiers)\n", 1766 | "for name, model in models:\n", 1767 | " model.fit(X_train, y_train)\n", 1768 | " y_pred = model.predict(X_test)\n", 1769 | " print(name)\n", 1770 | " print(accuracy_score(y_test, y_pred))\n", 1771 | " print(classification_report(y_test, y_pred))\n", 1772 | " " 1773 | ] 1774 | }, 1775 | { 1776 | "cell_type": "markdown", 1777 | "metadata": {}, 1778 | "source": [ 1779 | "## Conclusion : " 1780 | ] 1781 | }, 1782 | { 
1783 | "cell_type": "markdown", 1784 | "metadata": {}, 1785 | "source": [ 1786 | "#### From the above report, the Support Vector Machine with a 'linear' kernel performed best, with an F1-score of 0.96 on the test data." 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "markdown", 1791 | "metadata": {}, 1792 | "source": [ 1793 | "### Thanks !" 1794 | ] 1795 | } 1796 | ], 1797 | "metadata": { 1798 | "kernelspec": { 1799 | "display_name": "Python 3", 1800 | "language": "python", 1801 | "name": "python3" 1802 | }, 1803 | "language_info": { 1804 | "codemirror_mode": { 1805 | "name": "ipython", 1806 | "version": 3 1807 | }, 1808 | "file_extension": ".py", 1809 | "mimetype": "text/x-python", 1810 | "name": "python", 1811 | "nbconvert_exporter": "python", 1812 | "pygments_lexer": "ipython3", 1813 | "version": "3.7.3" 1814 | } 1815 | }, 1816 | "nbformat": 4, 1817 | "nbformat_minor": 2 1818 | } 1819 | -------------------------------------------------------------------------------- /DNA Classification Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DNA Classification Using Machine Learning " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## About :\n", 15 | "In this project, we will explore the world of bioinformatics by using Markov models, K-nearest neighbor (KNN) algorithms, support vector machines, and other common classifiers to classify short E. coli DNA sequences. 
This project will use a dataset from the UCI Machine Learning Repository that has 106 DNA sequences, with 57 sequential nucleotides (“base-pairs”) each.\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "\n", 21 | "It includes :\n", 22 | "" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Hide warnings\n", 37 | "import warnings\n", 38 | "warnings.simplefilter('ignore')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Step 1: Importing the Dataset\n", 46 | "\n", 47 | "The following code cells will import necessary libraries and import the dataset from the UCI repository as a pandas DataFrame." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "#import and change module name\n", 57 | "import numpy as np\n", 58 | "import matplotlib.pyplot as plt\n", 59 | "import pandas as pd\n", 60 | "url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'\n", 61 | "names = ['Class', 'id', 'Sequence']\n", 62 | "data = pd.read_csv(url, names = names)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "Index(['Class', 'id', 'Sequence'], dtype='object')" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "data.columns" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
ClassidSequence
0+S10\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1+AMPC\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2+AROH\\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...
3+DEOP2\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4+LEU1_TRNA\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " Class id Sequence\n", 153 | "0 + S10 \\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...\n", 154 | "1 + AMPC \\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...\n", 155 | "2 + AROH \\t\\tgtactagagaactagtgcattagcttatttttttgttatcat...\n", 156 | "3 + DEOP2 \\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...\n", 157 | "4 + LEU1_TRNA \\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc..." 158 | ] 159 | }, 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "data.head()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "(106, 3)" 178 | ] 179 | }, 180 | "execution_count": 5, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "data.shape" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 6, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "Class object\n", 198 | "id object\n", 199 | "Sequence object\n", 200 | "dtype: object" 201 | ] 202 | }, 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "data.dtypes" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Step 2: Preprocessing the Dataset\n", 217 | "\n", 218 | "The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms." 
219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 7, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "0 +\n", 230 | "1 +\n", 231 | "2 +\n", 232 | "3 +\n", 233 | "4 +\n", 234 | "Name: Class, dtype: object" 235 | ] 236 | }, 237 | "execution_count": 7, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# Build our dataset using custom pandas dataframe\n", 244 | "clases = data.loc[:,'Class']\n", 245 | "clases.head()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "['\\t\\ttactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt',\n", 257 | " '\\t\\ttgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa',\n", 258 | " '\\t\\tgtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg',\n", 259 | " '\\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc',\n", 260 | " '\\ttcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag',\n", 261 | " '\\taggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt',\n", 262 | " '\\t\\tcagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat',\n", 263 | " '\\t\\ttttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca',\n", 264 | " '\\t\\tcgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt',\n", 265 | " '\\tttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", 266 | " '\\tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg',\n", 267 | " '\\tcctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac',\n", 268 | " '\\tgatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga',\n", 269 | " '\\tctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca',\n", 270 | " '\\ttttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca',\n", 271 | " '\\taagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc',\n", 272 | " 
'\\tatgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca',\n", 273 | " '\\t\\taaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc',\n", 274 | " '\\t\\ttctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg',\n", 275 | " '\\t\\tgcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt',\n", 276 | " '\\t\\tgacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag',\n", 277 | " '\\t\\taaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta',\n", 278 | " '\\t\\ttctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca',\n", 279 | " '\\taccggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc',\n", 280 | " '\\t\\taaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg',\n", 281 | " '\\t\\tttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc',\n", 282 | " '\\t\\tcatcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt',\n", 283 | " '\\ttccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg',\n", 284 | " '\\tacagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg',\n", 285 | " '\\t\\ttgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt',\n", 286 | " '\\tctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg',\n", 287 | " '\\tattacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt',\n", 288 | " '\\tatgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg',\n", 289 | " '\\t\\ttaaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt',\n", 290 | " '\\t\\tatgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat',\n", 291 | " '\\tccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc',\n", 292 | " '\\t\\ttcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata',\n", 293 | " '\\t\\tccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat',\n", 294 | " '\\t\\tttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt',\n", 295 | " '\\t\\ttgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta',\n", 296 | " '\\tgatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt',\n", 297 | " 
'\\t\\taacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc',\n", 298 | " '\\tttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg',\n", 299 | " '\\t\\tgccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat',\n", 300 | " '\\tcagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt',\n", 301 | " '\\tcactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat',\n", 302 | " '\\t\\tatataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt',\n", 303 | " '\\t\\tcaaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat',\n", 304 | " '\\tggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct',\n", 305 | " '\\ttaggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg',\n", 306 | " '\\t\\tccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat',\n", 307 | " '\\t\\ttggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag',\n", 308 | " '\\ttcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat',\n", 309 | " '\\t\\tatatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta',\n", 310 | " '\\t\\tcgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc',\n", 311 | " '\\t\\tcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg',\n", 312 | " '\\t\\tttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat',\n", 313 | " '\\t\\tcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag',\n", 314 | " '\\t\\tgccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt',\n", 315 | " '\\t\\ttggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa',\n", 316 | " '\\t\\tgaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta',\n", 317 | " '\\t\\tcgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac',\n", 318 | " '\\t\\tcgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg',\n", 319 | " '\\t\\tctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg',\n", 320 | " '\\t\\tatagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt',\n", 321 | " '\\t\\taactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg',\n", 322 | " 
'\\t\\tttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt',\n", 323 | " '\\t\\ttattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga',\n", 324 | " '\\t\\taacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg',\n", 325 | " '\\t\\taagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc',\n", 326 | " '\\t\\tgaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact',\n", 327 | " '\\t\\tttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct',\n", 328 | " '\\t\\ttattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac',\n", 329 | " '\\t\\ttgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga',\n", 330 | " '\\t\\tcatgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc',\n", 331 | " '\\t\\tttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca',\n", 332 | " '\\t\\tcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt',\n", 333 | " '\\t\\taggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg',\n", 334 | " '\\t\\ttctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga',\n", 335 | " '\\t\\ttgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg',\n", 336 | " '\\t\\tctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa',\n", 337 | " '\\t\\tgcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt',\n", 338 | " '\\t\\tatccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta',\n", 339 | " '\\t\\ttggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg',\n", 340 | " '\\t\\ttctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac',\n", 341 | " '\\t\\ttattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt',\n", 342 | " '\\t\\ttagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg',\n", 343 | " '\\t\\tcagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg',\n", 344 | " '\\t\\tttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa',\n", 345 | " '\\t\\tacgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga',\n", 346 | " '\\t\\tggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta',\n", 347 | " 
'\\t\\taaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga',\n", 348 | " '\\t\\tagacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga',\n", 349 | " '\\t\\ttgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca',\n", 350 | " '\\t\\ttgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc',\n", 351 | " '\\t\\taggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag',\n", 352 | " '\\t\\tccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg',\n", 353 | " '\\t\\tcgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt',\n", 354 | " '\\t\\ttatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt',\n", 355 | " '\\t\\tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga',\n", 356 | " '\\t\\tgagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg',\n", 357 | " '\\t\\tcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat',\n", 358 | " '\\t\\tgtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg',\n", 359 | " '\\t\\tcgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc',\n", 360 | " '\\t\\tctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac',\n", 361 | " '\\t\\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact']" 362 | ] 363 | }, 364 | "execution_count": 8, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "# generate list of DNA sequence\n", 371 | "sequence = list(data.loc[:, 'Sequence'])\n", 372 | "sequence" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 9, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "['t',\n", 384 | " 'a',\n", 385 | " 'c',\n", 386 | " 't',\n", 387 | " 'a',\n", 388 | " 'g',\n", 389 | " 'c',\n", 390 | " 'a',\n", 391 | " 'a',\n", 392 | " 't',\n", 393 | " 'a',\n", 394 | " 'c',\n", 395 | " 'g',\n", 396 | " 'c',\n", 397 | " 't',\n", 398 | " 't',\n", 399 | " 'g',\n", 400 | " 'c',\n", 401 | " 'g',\n", 402 | " 't',\n", 403 | " 't',\n", 404 | " 'c',\n", 405 | " 'g',\n", 406 | " 
'g',\n", 407 | " 't',\n", 408 | " 'g',\n", 409 | " 'g',\n", 410 | " 't',\n", 411 | " 't',\n", 412 | " 'a',\n", 413 | " 'a',\n", 414 | " 'g',\n", 415 | " 't',\n", 416 | " 'a',\n", 417 | " 't',\n", 418 | " 'g',\n", 419 | " 't',\n", 420 | " 'a',\n", 421 | " 't',\n", 422 | " 'a',\n", 423 | " 'a',\n", 424 | " 't',\n", 425 | " 'g',\n", 426 | " 'c',\n", 427 | " 'g',\n", 428 | " 'c',\n", 429 | " 'g',\n", 430 | " 'g',\n", 431 | " 'g',\n", 432 | " 'c',\n", 433 | " 't',\n", 434 | " 't',\n", 435 | " 'g',\n", 436 | " 't',\n", 437 | " 'c',\n", 438 | " 'g',\n", 439 | " 't',\n", 440 | " '+']" 441 | ] 442 | }, 443 | "execution_count": 9, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "#Remove tab from each sequence\n", 450 | "dic = {}\n", 451 | "for i, seq in enumerate(sequence):\n", 452 | " nucleotides = list(seq)\n", 453 | " nucleotides = [char for char in nucleotides if char != '\\t']\n", 454 | " #append class assignment\n", 455 | " nucleotides.append(clases[i])\n", 456 | " \n", 457 | " dic[i] = nucleotides\n", 458 | "dic[0] " 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 10, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "data": { 468 | "text/html": [ 469 | "
\n", 470 | "\n", 483 | "\n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | "
0123456789...96979899100101102103104105
0ttgatactct...cctagcgcct
1agtacgatgt...cgagactgta
2ccatgggtat...gctagtacca
3ttctaggcct...atggactggc
4aatgtggtta...gaaggatata
\n", 633 | "

5 rows × 106 columns

\n", 634 | "
" 635 | ], 636 | "text/plain": [ 637 | " 0 1 2 3 4 5 6 7 8 9 ... 96 97 98 99 100 101 102 \\\n", 638 | "0 t t g a t a c t c t ... c c t a g c g \n", 639 | "1 a g t a c g a t g t ... c g a g a c t \n", 640 | "2 c c a t g g g t a t ... g c t a g t a \n", 641 | "3 t t c t a g g c c t ... a t g g a c t \n", 642 | "4 a a t g t g g t t a ... g a a g g a t \n", 643 | "\n", 644 | " 103 104 105 \n", 645 | "0 c c t \n", 646 | "1 g t a \n", 647 | "2 c c a \n", 648 | "3 g g c \n", 649 | "4 a t a \n", 650 | "\n", 651 | "[5 rows x 106 columns]" 652 | ] 653 | }, 654 | "execution_count": 10, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "# Convert Dict object into dataframe\n", 661 | "df = pd.DataFrame(dic)\n", 662 | "df.head()\n" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 11, 668 | "metadata": {}, 669 | "outputs": [ 670 | { 671 | "data": { 672 | "text/html": [ 673 | "
\n", 674 | "\n", 687 | "\n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | "
0123456789...48495051525354555657
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", 837 | "

5 rows × 58 columns

\n", 838 | "
" 839 | ], 840 | "text/plain": [ 841 | " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57\n", 842 | "0 t a c t a g c a a t ... g c t t g t c g t +\n", 843 | "1 t g c t a t c c t g ... c a t c g c c a a +\n", 844 | "2 g t a c t a g a g a ... c a c c c g g c g +\n", 845 | "3 a a t t g t g a t g ... a a c a a a c t c +\n", 846 | "4 t c g a t a a t t a ... c c g t g g t a g +\n", 847 | "\n", 848 | "[5 rows x 58 columns]" 849 | ] 850 | }, 851 | "execution_count": 11, 852 | "metadata": {}, 853 | "output_type": "execute_result" 854 | } 855 | ], 856 | "source": [ 857 | "# transpose dataframe into correct format\n", 858 | "df = df.transpose()\n", 859 | "df.head()" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 12, 865 | "metadata": {}, 866 | "outputs": [ 867 | { 868 | "data": { 869 | "text/plain": [ 870 | "RangeIndex(start=0, stop=58, step=1)" 871 | ] 872 | }, 873 | "execution_count": 12, 874 | "metadata": {}, 875 | "output_type": "execute_result" 876 | } 877 | ], 878 | "source": [ 879 | "df.columns" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": 13, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [ 888 | "# Rename\n", 889 | "df.rename(columns = {57:'Class'}, inplace = True)" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": 14, 895 | "metadata": {}, 896 | "outputs": [ 897 | { 898 | "data": { 899 | "text/plain": [ 900 | "Index([ 0, 1, 2, 3, 4, 5, 6, 7,\n", 901 | " 8, 9, 10, 11, 12, 13, 14, 15,\n", 902 | " 16, 17, 18, 19, 20, 21, 22, 23,\n", 903 | " 24, 25, 26, 27, 28, 29, 30, 31,\n", 904 | " 32, 33, 34, 35, 36, 37, 38, 39,\n", 905 | " 40, 41, 42, 43, 44, 45, 46, 47,\n", 906 | " 48, 49, 50, 51, 52, 53, 54, 55,\n", 907 | " 56, 'Class'],\n", 908 | " dtype='object')" 909 | ] 910 | }, 911 | "execution_count": 14, 912 | "metadata": {}, 913 | "output_type": "execute_result" 914 | } 915 | ], 916 | "source": [ 917 | "df.columns" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 
922 | "execution_count": 15, 923 | "metadata": {}, 924 | "outputs": [ 925 | { 926 | "data": { 927 | "text/html": [ 928 | "
\n", 929 | "\n", 942 | "\n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " 
\n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | "
0123456789...484950515253545556Class
0tactagcaat...gcttgtcgt+
1tgctatcctg...catcgccaa+
2gtactagaga...cacccggcg+
3aattgtgatg...aacaaactc+
4tcgataatta...ccgtggtag+
\n", 1092 | "

5 rows × 58 columns

\n", 1093 | "
" 1094 | ], 1095 | "text/plain": [ 1096 | " 0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class\n", 1097 | "0 t a c t a g c a a t ... g c t t g t c g t +\n", 1098 | "1 t g c t a t c c t g ... c a t c g c c a a +\n", 1099 | "2 g t a c t a g a g a ... c a c c c g g c g +\n", 1100 | "3 a a t t g t g a t g ... a a c a a a c t c +\n", 1101 | "4 t c g a t a a t t a ... c c g t g g t a g +\n", 1102 | "\n", 1103 | "[5 rows x 58 columns]" 1104 | ] 1105 | }, 1106 | "execution_count": 15, 1107 | "metadata": {}, 1108 | "output_type": "execute_result" 1109 | } 1110 | ], 1111 | "source": [ 1112 | "df.head()" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 16, 1118 | "metadata": {}, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/html": [ 1123 | "
\n", 1124 | "\n", 1137 | "\n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " 
\n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...55_a55_c55_g55_t56_a56_c56_g56_tClass_+Class_-
00001100001...0010000110
10001001001...1000100010
20010000110...0100001010
31000100000...0001010010
40001010000...1000001010
\n", 1287 | "

5 rows × 230 columns

\n", 1288 | "
" 1289 | ], 1290 | "text/plain": [ 1291 | " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 55_a 55_c 55_g \\\n", 1292 | "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 1 \n", 1293 | "1 0 0 0 1 0 0 1 0 0 1 ... 1 0 0 \n", 1294 | "2 0 0 1 0 0 0 0 1 1 0 ... 0 1 0 \n", 1295 | "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", 1296 | "4 0 0 0 1 0 1 0 0 0 0 ... 1 0 0 \n", 1297 | "\n", 1298 | " 55_t 56_a 56_c 56_g 56_t Class_+ Class_- \n", 1299 | "0 0 0 0 0 1 1 0 \n", 1300 | "1 0 1 0 0 0 1 0 \n", 1301 | "2 0 0 0 1 0 1 0 \n", 1302 | "3 1 0 1 0 0 1 0 \n", 1303 | "4 0 0 0 1 0 1 0 \n", 1304 | "\n", 1305 | "[5 rows x 230 columns]" 1306 | ] 1307 | }, 1308 | "execution_count": 16, 1309 | "metadata": {}, 1310 | "output_type": "execute_result" 1311 | } 1312 | ], 1313 | "source": [ 1314 | "#Encoding\n", 1315 | "numerical_df = pd.get_dummies(df)\n", 1316 | "numerical_df.head()" 1317 | ] 1318 | }, 1319 | { 1320 | "cell_type": "code", 1321 | "execution_count": 17, 1322 | "metadata": {}, 1323 | "outputs": [ 1324 | { 1325 | "data": { 1326 | "text/html": [ 1327 | "
\n", 1328 | "\n", 1341 | "\n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " 
\n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | "
0_a0_c0_g0_t1_a1_c1_g1_t2_a2_c...54_t55_a55_c55_g55_t56_a56_c56_g56_tClass_+
00001100001...0001000011
10001001001...0100010001
20010000110...0010000101
31000100000...0000101001
40001010000...1100000101
\n", 1491 | "

5 rows × 229 columns

\n", 1492 | "
" 1493 | ], 1494 | "text/plain": [ 1495 | " 0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 54_t 55_a 55_c \\\n", 1496 | "0 0 0 0 1 1 0 0 0 0 1 ... 0 0 0 \n", 1497 | "1 0 0 0 1 0 0 1 0 0 1 ... 0 1 0 \n", 1498 | "2 0 0 1 0 0 0 0 1 1 0 ... 0 0 1 \n", 1499 | "3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 \n", 1500 | "4 0 0 0 1 0 1 0 0 0 0 ... 1 1 0 \n", 1501 | "\n", 1502 | " 55_g 55_t 56_a 56_c 56_g 56_t Class_+ \n", 1503 | "0 1 0 0 0 0 1 1 \n", 1504 | "1 0 0 1 0 0 0 1 \n", 1505 | "2 0 0 0 0 1 0 1 \n", 1506 | "3 0 1 0 1 0 0 1 \n", 1507 | "4 0 0 0 0 1 0 1 \n", 1508 | "\n", 1509 | "[5 rows x 229 columns]" 1510 | ] 1511 | }, 1512 | "execution_count": 17, 1513 | "metadata": {}, 1514 | "output_type": "execute_result" 1515 | } 1516 | ], 1517 | "source": [ 1518 | "# Drop class_- or Class_+ either of one\n", 1519 | "numerical_df.drop('Class_-', axis = 1, inplace = True)\n", 1520 | "numerical_df.head()" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 18, 1526 | "metadata": {}, 1527 | "outputs": [], 1528 | "source": [ 1529 | "# rename Class_+ to Class\n", 1530 | "numerical_df.rename(columns = {'Class_+':'Class'}, inplace = True)" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "markdown", 1535 | "metadata": {}, 1536 | "source": [ 1537 | "## Step 3: Training and Testing the Classification Algorithms\n", 1538 | "\n", 1539 | "Now that we have preprocessed the data and built our training and testing datasets, we can start to deploy different classification algorithms. It's relatively easy to test multiple models; as a result, we will compare and contrast the performance of ten different algorithms." 
1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 19, 1545 | "metadata": {}, 1546 | "outputs": [], 1547 | "source": [ 1548 | "#Importing different classifier from sklearn\n", 1549 | "from sklearn.neighbors import KNeighborsClassifier\n", 1550 | "from sklearn.tree import DecisionTreeClassifier\n", 1551 | "from sklearn import svm\n", 1552 | "from sklearn.naive_bayes import GaussianNB\n", 1553 | "from sklearn.gaussian_process.kernels import RBF\n", 1554 | "from sklearn.gaussian_process import GaussianProcessClassifier\n", 1555 | "from sklearn.neural_network import MLPClassifier\n", 1556 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 1557 | "from sklearn.metrics import classification_report, accuracy_score" 1558 | ] 1559 | }, 1560 | { 1561 | "cell_type": "code", 1562 | "execution_count": 20, 1563 | "metadata": {}, 1564 | "outputs": [], 1565 | "source": [ 1566 | "from sklearn.model_selection import train_test_split\n", 1567 | "X = numerical_df.drop(['Class'], axis = 1).values\n", 1568 | "y = numerical_df['Class'].values\n", 1569 | "\n", 1570 | "#define a seed for reproducibility\n", 1571 | "seed = 1\n", 1572 | "\n", 1573 | "# Splitting data into training and testing data\n", 1574 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = seed)\n" 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "code", 1579 | "execution_count": 21, 1580 | "metadata": {}, 1581 | "outputs": [ 1582 | { 1583 | "name": "stdout", 1584 | "output_type": "stream", 1585 | "text": [ 1586 | "K Nearest Neighbors: 0.8232142857142858 (0.11390841738440759)\n", 1587 | "Gaussian Process: 0.8732142857142857 (0.05615780426255853)\n", 1588 | "Decision Tree: 0.7482142857142857 (0.19256757361550586)\n", 1589 | "Random Forest: 0.6428571428571429 (0.1326726830071908)\n", 1590 | "Neural Net: 0.8625 (0.11792476415070755)\n", 1591 | "AddaBoost: 0.9125 (0.1125)\n", 1592 | "Naive Bayes: 0.8375 (0.1375)\n", 1593 | "SVM 
Linear: 0.85 (0.10897247358851683)\n", 1594 | "SVM RBF: 0.7375 (0.11792476415070755)\n", 1595 | "SVM Sigmoid: 0.5696428571428571 (0.1592092225048921)\n" 1596 | ] 1597 | } 1598 | ], 1599 | "source": [ 1600 | "# Define scoring method\n", 1601 | "scoring = 'accuracy'\n", 1602 | "# Model building to train\n", 1603 | "names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AddaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']\n", 1604 | "Classifiers = [\n", 1605 | " KNeighborsClassifier(n_neighbors = 3),\n", 1606 | " GaussianProcessClassifier(1.0*RBF(1.0)),\n", 1607 | " DecisionTreeClassifier(max_depth = 5),\n", 1608 | " RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1 ),\n", 1609 | " MLPClassifier(alpha = 1),\n", 1610 | " AdaBoostClassifier(),\n", 1611 | " GaussianNB(),\n", 1612 | " svm.SVC(kernel = 'linear'),\n", 1613 | " svm.SVC(kernel = 'rbf'),\n", 1614 | " svm.SVC(kernel = 'sigmoid')\n", 1615 | " \n", 1616 | " ]\n", 1617 | "models = zip(names, Classifiers)\n", 1618 | "# import KFold\n", 1619 | "from sklearn.model_selection import KFold, cross_val_score\n", 1620 | "\n", 1621 | "names = []\n", 1622 | "result = []\n", 1623 | "for name, model in models:\n", 1624 | " kfold = KFold(n_splits = 10, random_state = 1)\n", 1625 | " cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'accuracy')\n", 1626 | " result.append(cv_results)\n", 1627 | " names.append(name)\n", 1628 | " msg = \"{0}: {1} ({2})\".format(name, cv_results.mean(), cv_results.std())\n", 1629 | " print(msg)" 1630 | ] 1631 | }, 1632 | { 1633 | "cell_type": "markdown", 1634 | "metadata": {}, 1635 | "source": [ 1636 | "## Step 4 : Model Evaluation\n", 1637 | "\n", 1638 | "Now we will evaluate our classification algorithms on the held-out test set using the accuracy score and the classification report."
1639 | ] 1640 | }, 1641 | { 1642 | "cell_type": "code", 1643 | "execution_count": 22, 1644 | "metadata": {}, 1645 | "outputs": [ 1646 | { 1647 | "name": "stdout", 1648 | "output_type": "stream", 1649 | "text": [ 1650 | "K Nearest Neighbors\n", 1651 | "0.7777777777777778\n", 1652 | " precision recall f1-score support\n", 1653 | "\n", 1654 | " 0 1.00 0.65 0.79 17\n", 1655 | " 1 0.62 1.00 0.77 10\n", 1656 | "\n", 1657 | " accuracy 0.78 27\n", 1658 | " macro avg 0.81 0.82 0.78 27\n", 1659 | "weighted avg 0.86 0.78 0.78 27\n", 1660 | "\n", 1661 | "Gaussian Process\n", 1662 | "0.8888888888888888\n", 1663 | " precision recall f1-score support\n", 1664 | "\n", 1665 | " 0 1.00 0.82 0.90 17\n", 1666 | " 1 0.77 1.00 0.87 10\n", 1667 | "\n", 1668 | " accuracy 0.89 27\n", 1669 | " macro avg 0.88 0.91 0.89 27\n", 1670 | "weighted avg 0.91 0.89 0.89 27\n", 1671 | "\n", 1672 | "Decision Tree\n", 1673 | "0.7777777777777778\n", 1674 | " precision recall f1-score support\n", 1675 | "\n", 1676 | " 0 1.00 0.65 0.79 17\n", 1677 | " 1 0.62 1.00 0.77 10\n", 1678 | "\n", 1679 | " accuracy 0.78 27\n", 1680 | " macro avg 0.81 0.82 0.78 27\n", 1681 | "weighted avg 0.86 0.78 0.78 27\n", 1682 | "\n", 1683 | "Random Forest\n", 1684 | "0.5185185185185185\n", 1685 | " precision recall f1-score support\n", 1686 | "\n", 1687 | " 0 0.70 0.41 0.52 17\n", 1688 | " 1 0.41 0.70 0.52 10\n", 1689 | "\n", 1690 | " accuracy 0.52 27\n", 1691 | " macro avg 0.56 0.56 0.52 27\n", 1692 | "weighted avg 0.59 0.52 0.52 27\n", 1693 | "\n", 1694 | "Neural Net\n", 1695 | "0.9259259259259259\n", 1696 | " precision recall f1-score support\n", 1697 | "\n", 1698 | " 0 1.00 0.88 0.94 17\n", 1699 | " 1 0.83 1.00 0.91 10\n", 1700 | "\n", 1701 | " accuracy 0.93 27\n", 1702 | " macro avg 0.92 0.94 0.92 27\n", 1703 | "weighted avg 0.94 0.93 0.93 27\n", 1704 | "\n", 1705 | "AddaBoost\n", 1706 | "0.8518518518518519\n", 1707 | " precision recall f1-score support\n", 1708 | "\n", 1709 | " 0 1.00 0.76 0.87 17\n", 1710 | " 1 0.71 1.00 
0.83 10\n", 1711 | "\n", 1712 | " accuracy 0.85 27\n", 1713 | " macro avg 0.86 0.88 0.85 27\n", 1714 | "weighted avg 0.89 0.85 0.85 27\n", 1715 | "\n", 1716 | "Naive Bayes\n", 1717 | "0.9259259259259259\n", 1718 | " precision recall f1-score support\n", 1719 | "\n", 1720 | " 0 1.00 0.88 0.94 17\n", 1721 | " 1 0.83 1.00 0.91 10\n", 1722 | "\n", 1723 | " accuracy 0.93 27\n", 1724 | " macro avg 0.92 0.94 0.92 27\n", 1725 | "weighted avg 0.94 0.93 0.93 27\n", 1726 | "\n", 1727 | "SVM Linear\n", 1728 | "0.9629629629629629\n", 1729 | " precision recall f1-score support\n", 1730 | "\n", 1731 | " 0 1.00 0.94 0.97 17\n", 1732 | " 1 0.91 1.00 0.95 10\n", 1733 | "\n", 1734 | " accuracy 0.96 27\n", 1735 | " macro avg 0.95 0.97 0.96 27\n", 1736 | "weighted avg 0.97 0.96 0.96 27\n", 1737 | "\n", 1738 | "SVM RBF\n", 1739 | "0.7777777777777778\n", 1740 | " precision recall f1-score support\n", 1741 | "\n", 1742 | " 0 1.00 0.65 0.79 17\n", 1743 | " 1 0.62 1.00 0.77 10\n", 1744 | "\n", 1745 | " accuracy 0.78 27\n", 1746 | " macro avg 0.81 0.82 0.78 27\n", 1747 | "weighted avg 0.86 0.78 0.78 27\n", 1748 | "\n", 1749 | "SVM Sigmoid\n", 1750 | "0.4444444444444444\n", 1751 | " precision recall f1-score support\n", 1752 | "\n", 1753 | " 0 1.00 0.12 0.21 17\n", 1754 | " 1 0.40 1.00 0.57 10\n", 1755 | "\n", 1756 | " accuracy 0.44 27\n", 1757 | " macro avg 0.70 0.56 0.39 27\n", 1758 | "weighted avg 0.78 0.44 0.34 27\n", 1759 | "\n" 1760 | ] 1761 | } 1762 | ], 1763 | "source": [ 1764 | "#Test the algorithm on the test data set\n", 1765 | "models = zip(names, Classifiers)\n", 1766 | "for name, model in models:\n", 1767 | " model.fit(X_train, y_train)\n", 1768 | " y_pred = model.predict(X_test)\n", 1769 | " print(name)\n", 1770 | " print(accuracy_score(y_test, y_pred))\n", 1771 | " print(classification_report(y_test, y_pred))\n", 1772 | " " 1773 | ] 1774 | }, 1775 | { 1776 | "cell_type": "markdown", 1777 | "metadata": {}, 1778 | "source": [ 1779 | "## Conclusion : " 1780 | ] 1781 | }, 1782 | { 
1783 | "cell_type": "markdown", 1784 | "metadata": {}, 1785 | "source": [ 1786 | "#### From the above report, the Support Vector Machine with the 'linear' kernel performed best, with an F1-score of 0.96 on the test data." 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "markdown", 1791 | "metadata": {}, 1792 | "source": [ 1793 | "### Thanks!" 1794 | ] 1795 | } 1796 | ], 1797 | "metadata": { 1798 | "kernelspec": { 1799 | "display_name": "Python 3", 1800 | "language": "python", 1801 | "name": "python3" 1802 | }, 1803 | "language_info": { 1804 | "codemirror_mode": { 1805 | "name": "ipython", 1806 | "version": 3 1807 | }, 1808 | "file_extension": ".py", 1809 | "mimetype": "text/x-python", 1810 | "name": "python", 1811 | "nbconvert_exporter": "python", 1812 | "pygments_lexer": "ipython3", 1813 | "version": "3.7.3" 1814 | } 1815 | }, 1816 | "nbformat": 4, 1817 | "nbformat_minor": 2 1818 | } 1819 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DNA Classification Using Machine Learning 🧬 2 | 3 | This project presents a methodical approach to classifying DNA sequences leveraging machine learning techniques 🤖. It includes the journey from raw data preprocessing to the evaluation of several classification algorithms, culminating in identifying the most effective model for this task. 4 | 5 | ## Overview 📖 6 | 7 | The DNA Classification Project is rooted in bioinformatics, aiming to classify DNA sequences accurately 🔍. It undertakes a detailed exploration of various machine learning algorithms to ascertain the best fit for classifying DNA sequences. 8 | 9 | ## Contents 📚 10 | 11 | ### Step 1: Importing the Dataset 📥 12 | 13 | - Introduction to and importation of the dataset that comprises DNA sequences.
14 | 15 | ### Step 2: Preprocessing the Dataset 🛠 16 | 17 | - The dataset undergoes several preprocessing steps to transform raw DNA sequences into a format amenable to machine learning algorithms. This includes encoding sequences, dealing with missing values, and normalizing data. 18 | 19 | ### Step 3: Training and Testing the Classification Algorithms 🏋️‍♂️ 20 | 21 | - **Algorithms Explored**: 22 | - **K-Nearest Neighbors (KNN)** 🚶‍♂️ 23 | - **Support Vector Machine (SVM)** ⚔ 24 | - Variants with different kernels are tested, including linear, radial basis function (RBF), and sigmoid. 25 | - **Decision Trees** 🌳 26 | - **Random Forest** 🌲 27 | - **Naive Bayes** 🔮 28 | - **MultiLayer Perceptron** 🧠 29 | - **AdaBoost Classifier** 🚀 30 | 31 | ### Step 4: Model Evaluation 📊 32 | 33 | - The models are evaluated based on accuracy, precision, recall, and F1 score metrics. This step involves a critical assessment of each model's performance to identify the best-performing model. 34 | - **Conclusion**: The notebook concludes by endorsing the **Support Vector Machine** with a 'linear' kernel as the most efficient model, achieving an F1_score of 0.96 on the test data. 35 | 36 | ## Conclusion 🏁 37 | 38 | This project's findings underscore the efficacy of machine learning in the realm of DNA sequence classification, with the **Support Vector Machine (linear kernel)** standing out for its superior performance.
39 | -------------------------------------------------------------------------------- /promoters.data: -------------------------------------------------------------------------------- 1 | +,S10, tactagcaatacgcttgcgttcggtggttaagtatgtataatgcgcgggcttgtcgt 2 | +,AMPC, tgctatcctgacagttgtcacgctgattggtgtcgttacaatctaacgcatcgccaa 3 | +,AROH, gtactagagaactagtgcattagcttatttttttgttatcatgctaaccacccggcg 4 | +,DEOP2, aattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaatactaacaaactc 5 | +,LEU1_TRNA, tcgataattaactattgacgaaaagctgaaaaccactagaatgcgcctccgtggtag 6 | +,MALEFG, aggggcaaggaggatggaaagaggttgccgtataaagaaactagagtccgtttaggt 7 | +,MALK, cagggggtggaggatttaagccatctcctgatgacgcatagtcagcccatcatgaat 8 | +,RECA, tttctacaaaacacttgatactgtatgagcatacagtataattgcttcaacagaaca 9 | +,RPOB, cgacttaatatactgcgacaggacgtccgttctgtgtaaatcgcaatgaaatggttt 10 | +,RRNAB_P1, ttttaaatttcctcttgtcaggccggaataactccctataatgcgccaccactgaca 11 | +,RRNAB_P2, gcaaaaataaatgcttgactctgtagcgggaaggcgtattatgcacaccccgcgccg 12 | +,RRNDEX_P2, cctgaaattcagggttgactctgaaagaggaaagcgtaatatacgccacctcgcgac 13 | +,RRND_P1, gatcaaaaaaatacttgtgcaaaaaattgggatccctataatgcgcctccgttgaga 14 | +,RRNE_P1, ctgcaatttttctattgcggcctgcggagaactccctataatgcgcctccatcgaca 15 | +,RRNG_P1, tttatatttttcgcttgtcaggccggaataactccctataatgcgccaccactgaca 16 | +,RRNG_P2, aagcaaagaaatgcttgactctgtagcgggaaggcgtattatgcacaccgccgcgcc 17 | +,RRNX_P1, atgcatttttccgcttgtcttcctgagccgactccctataatgcgcctccatcgaca 18 | +,TNAA, aaacaatttcagaatagacaaaaactctgagtgtaataatgtagcctcgtgtcttgc 19 | +,TYRT, tctcaacgtaacactttacagcggcgcgtcatttgatatgatgcgccccgcttcccg 20 | +,ARAC, gcaaataatcaatgtggacttttctgccgtgattatagacacttttgttacgcgttt 21 | +,LACI, gacaccatcgaatggcgcaaaacctttcgcggtatggcatgatagcgcccggaagag 22 | +,MALT, aaaaacgtcatcgcttgcattagaaaggtttctggccgaccttataaccattaatta 23 | +,TRP, tctgaaatgagctgttgacaattaatcatcgaactagttaactagtacgcaagttca 24 | +,TRPP2, accggaagaaaaccgtgacattttaacacgtttgttacaaggtaaaggcgacgccgc 25 | +,THR, aaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcg 26 | +,BIOB, 
ttgtcataatcgacttgtaaaccaaattgaaaagatttaggtttacaagtctacacc 27 | +,FOL, catcctcgcaccagtcgacgacggtttacgctttacgtatagtggcgacaatttttt 28 | +,UVRBP1, tccagtataatttgttggcataattaagtacgacgagtaaaattacatacctgcccg 29 | +,UVRBP3, acagttatccactattcctgtggataaccatgtgtattagagttagaaaacacgagg 30 | +,LEXA, tgtgcagtttatggttccaaaatcgccttttgctgtatatactcacagcataactgt 31 | +,PORI-L, ctgttgttcagtttttgagttgtgtataacccctcattctgatcccagcttatacgg 32 | +,SPOT42, attacaaaaagtgctttctgaactgaacaaaaaagagtaaagttagtcgcgtagggt 33 | +,M1RNA, atgcgcaacgcggggtgacaagggcgcgcaaaccctctatactgcgcgccgaagctg 34 | +,GLNS, taaaaaactaacagttgtcagcctgtcccgcttataagatcatacgccgttatacgt 35 | +,TUFB, atgcaattttttagttgcatgaactcgcatgtctccatagaatgcgcgctacttgat 36 | +,SUBB-E, ccttgaaaaagaggttgacgctgcaaggctctatacgcataatgcgccccgcaacgc 37 | +,STR, tcgttgtatatttcttgacaccttttcggcatcgccctaaaattcggcgtcctcata 38 | +,SPC, ccgtttattttttctacccatatccttgaagcggtgttataatgccgcgccctcgat 39 | +,RPOA, ttcgcatatttttcttgcaaagttgggttgagctggctagattagccagccaatctt 40 | +,RPLJ, tgtaaactaatgcctttacgtgggcggtgattttgtctacaatcttacccccacgta 41 | +,PORI-R, gatcgcacgatctgtatacttatttgagtaaattaacccacgatcccagccattctt 42 | +,ALAS, aacgcatacggtattttaccttcccagtcaagaaaacttatcttattcccacttttc 43 | +,ARABAD, ttagcggatcctacctgacgctttttatcgcaactctctactgtttctccatacccg 44 | +,BIOA, gccttctccaaaacgtgttttttgttgttaattcggtgtagacttgtaaacctaaat 45 | +,DEOP1, cagaaacgttttattcgaacatcgatctcgtcttgtgttagaattctaacatacggt 46 | +,GALP2, cactaatttattccatgtcacacttttcgcatctttgttatgctatggttatttcat 47 | +,HIS, atataaaaaagttcttgctttctaacgtgaaagtggtttaggttaaaagacatcagt 48 | +,HISJ, caaggtagaatgctttgccttgtcggcctgattaatggcacgatagtcgcatcggat 49 | +,ILVGEDA, ggccaaaaaatatcttgtactatttacaaaacctatggtaactctttaggcattcct 50 | +,LACP1, taggcaccccaggctttacactttatgcttccggctcgtatgttgtgtggaattgtg 51 | +,LPP, ccatcaaaaaaatattctcaacataaaaaactttgtgtaatacttgtaacgctacat 52 | +,TRPR, tggggacgtcgttactgatccgcacgtttatgatatgctatcgtactctttagcgag 53 | +,UVRB_P2, tcagaaatattatggtgatgaactgtttttttatccagtataatttgttggcataat 54 | 
-, 867, atatgaacgttgagactgccgctgagttatcagctgtgaacgacattctggcgtcta 55 | -,1169, cgaacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctc 56 | -, 802, caatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatatgcg 57 | -, 521, ttgacctactacgccagcattttggcggtgtaagctaaccattccggttgactcaat 58 | -, 918, cgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatgcag 59 | -,1481, gccaatcaatcaagaacttgaagggtggtatcagccaacagcctgacatccttcgtt 60 | -,1024, tggatggacgttcaacattgaggaaggcataacgctactacctgatgtttactccaa 61 | -,1149, gaggtggctatgtgtatgaccgaacgagtcaatcagaccgctttgactctggtatta 62 | -, 313, cgtagcgcatcagtgctttcttactgtgagtacgcaccagcgccagaggacgacgac 63 | -, 780, cgaccgaagcgagcctcgtcctcaatggcctctaaacgggtcttgaggggttttttg 64 | -,1384, ctacggtgggtacaatatgctggatggagatgcgttcacttctggtctactgactcg 65 | -, 507, atagtctcagagtcttgacctactacgccagcattttggcggtgtaagctaaccatt 66 | -, 39, aactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagcagcg 67 | -,1203, ttactgtgaacattattcgtctccgcgactacgatgagatgcctgagtgcttccgtt 68 | -, 988, tattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattga 69 | -,1171, aacgagtcaatcagaccgctttgactctggtattactgtgaacattattcgtctccg 70 | -, 753, aagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatggcc 71 | -, 630, gaagaccacgcctcgccaccgagtagacccttagagagcatgtcagcctcgacaact 72 | -, 660, ttagagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccct 73 | -,1216, tattcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcac 74 | -, 835, tgctgaaaggaggaactatatgcgctcatacgatatgaacgttgagactgccgctga 75 | -, 35, catgaactcaaggctgatacggcgagacttgcgagccttgtccttgcggtacacagc 76 | -,1218, ttcgtctccgcgactacgatgagatgcctgagtgcttccgttactggattgtcacca 77 | -, 668, catgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacgcgctt 78 | -, 413, aggaggaactacgcaaggttggaacatcggagagatgccagccagcgcacctgcacg 79 | -, 991, tctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattgagga 80 | -, 751, tgaagtgcttagcttcaaggtcacggatacgaccgaagcgagcctcgtcctcaatgg 81 | -, 850, ctatatgcgctcatacgatatgaacgttgagactgccgctgagttatcagctgtgaa 82 | -, 93, 
gcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccggt 83 | -,1108, atccctaatgtctacttccggtcaatccatctacgttaaccgaggtggctatgtgta 84 | -, 915, tggcgtctatcggtgaacctccggtatcaacgctggaaggtgacgctaacgcagatg 85 | -,1019, tctcgtggatggacgttcaacattgaggaaggcataacgctactacctgatgtttac 86 | -, 19, tattggcttgctcaagcatgaactcaaggctgatacggcgagacttgcgagccttgt 87 | -,1320, tagagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatg 88 | -, 91, cagcggcagcacgtttccacgcggtgagagcctcaggattcatgtcgatgtcttccg 89 | -, 217, ttacgttggcgaccgctaggactttcttgttgattttccatgcggtgttttgcgcaa 90 | -, 957, acgctaacgcagatgcagcgaacgctcggcgtattctcaacaagattaaccgacaga 91 | -, 260, ggtgttttgcgcaatgttaatcgctttgtacacctcaggcatgtaaacgtcttcgta 92 | -, 557, aaccattccggttgactcaatgagcatctcgatgcagcgtactcctacatgaataga 93 | -,1355, agacgtctctgcatggagtatgagatggactacggtgggtacaatatgctggatgga 94 | -, 244, tgttgattttccatgcggtgttttgcgcaatgttaatcgctttgtacacctcaggca 95 | -, 464, tgcacgggttgcgatagcctcagcgtattcaggtgcgagttcgatagtctcagagtc 96 | -, 296, aggcatgtaaacgtcttcgtagcgcatcagtgctttcttactgtgagtacgcaccag 97 | -, 648, ccgagtagacccttagagagcatgtcagcctcgacaacttgcataaatgctttcttg 98 | -, 230, cgctaggactttcttgttgattttccatgcggtgttttgcgcaatgttaatcgcttt 99 | -,1163, tatgaccgaacgagtcaatcagaccgctttgactctggtattactgtgaacattatt 100 | -,1321, agagggtgtactccaagaagaggaagatgaggctagacgtctctgcatggagtatga 101 | -, 663, gagagcatgtcagcctcgacaacttgcataaatgctttcttgtagacgtgccctacg 102 | -, 799, cctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaactatat 103 | -, 987, gtattctcaacaagattaaccgacagattcaatctcgtggatggacgttcaacattg 104 | -,1226, cgcgactacgatgagatgcctgagtgcttccgttactggattgtcaccaaggcttcc 105 | -, 794, ctcgtcctcaatggcctctaaacgggtcttgaggggttttttgctgaaaggaggaac 106 | -,1442, taacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact 107 | --------------------------------------------------------------------------------