├── Capstone Project - Extract Phishing.ipynb ├── Capstone Project - Feature Importances.ipynb ├── Capstone Project - Preproccess + Feature Extraction.ipynb ├── Capstone Project - Testing Supervised, Phishing-Ham.ipynb ├── Capstone Project - Testing Supervised, Spam-Ham.ipynb ├── Capstone Project - Testing Unsupervised, Phishing-Ham.ipynb ├── Capstone Project - Testing Unsupervised, Spam-Ham.ipynb ├── Capstone project - Extract HamSpam.ipynb ├── README.md └── preprocessed_spam_ham_phishing.csv /Capstone Project - Extract Phishing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import shutil\n", 11 | "from os import listdir, mkdir, path\n", 12 | "from pathlib import Path\n", 13 | "import sys\n", 14 | "from email.parser import HeaderParser\n", 15 | "import collections\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from matplotlib.pyplot import figure\n", 18 | "import time\n", 19 | "from collections import Counter" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 17, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "parser = HeaderParser()\n", 29 | "columns = ['received1',\n", 30 | "'received2',\n", 31 | "'received3',\n", 32 | "'received4',\n", 33 | "'received5',\n", 34 | "'received6',\n", 35 | "'received7',\n", 36 | "'received8',\n", 37 | "'hops',\n", 38 | "'subject',\n", 39 | "'date',\n", 40 | "'message-id',\n", 41 | "'from',\n", 42 | "'return-path',\n", 43 | "'to',\n", 44 | "'content-type',\n", 45 | "'mime-version',\n", 46 | "'x-mailer',\n", 47 | "'content-transfer-encoding',\n", 48 | "'x-mimeole',\n", 49 | "'x-priority',\n", 50 | "'list-id',\n", 51 | "'lines',\n", 52 | "'x-virus-scanned',\n", 53 | "'status',\n", 54 | "'content-length',\n", 55 | "'precedence',\n", 56 | "'delivered-to',\n", 57 | "'list-unsubscribe',\n", 
58 | "'list-subscribe',\n", 59 | "'list-post',\n", 60 | "'list-help',\n", 61 | "'x-msmail-priority',\n", 62 | "'x-spam-status',\n", 63 | "'sender',\n", 64 | "'errors-to',\n", 65 | "'x-beenthere',\n", 66 | "'list-archive',\n", 67 | "'reply-to',\n", 68 | "'x-mailman-version',\n", 69 | "'x-miltered',\n", 70 | "'x-uuid',\n", 71 | "'x-virus-status',\n", 72 | "'x-spam-level',\n", 73 | "'x-spam-checker-version',\n", 74 | "'references',\n", 75 | "'in-reply-to',\n", 76 | "'user-agent',\n", 77 | "'thread-index',\n", 78 | "'cc',\n", 79 | "'received-spf',\n", 80 | "'x-original-to',\n", 81 | "'content-disposition',\n", 82 | "'mailing-list',\n", 83 | "'x-spam-check-by',\n", 84 | "'domainkey-signature',\n", 85 | "'importance',\n", 86 | "'x-mailing-list',\n", 87 | "'label']\n", 88 | "\n", 89 | "list_of_rows = []\n", 90 | "\n", 91 | "\n", 92 | "def addEmailsToDict(email_list):\n", 93 | " global parser, list_of_rows\n", 94 | " \n", 95 | " # The label for phishing, set to '2'\n", 96 | " label = 2\n", 97 | " \n", 98 | " \n", 99 | " # Read the full email content\n", 100 | " for email in email_list:\n", 101 | " row_dict = {}\n", 102 | " \n", 103 | " # Parse the email content\n", 104 | " h = parser.parsestr(email)\n", 105 | "\n", 106 | " # Parse the received fields\n", 107 | " received_list = h.get_all('received')\n", 108 | " hops = 0\n", 109 | " if received_list is not None:\n", 110 | " hops = len(received_list)\n", 111 | " col_name_recieved = 'received'\n", 112 | "\n", 113 | " for inx, received_field in enumerate(received_list):\n", 114 | " col = col_name_recieved + str(inx+1)\n", 115 | " row_dict[col] = received_field\n", 116 | "\n", 117 | "\n", 118 | " # Make everything lowercase to avoid issues\n", 119 | " features_lower_case = [x.lower() for x in h.keys()]\n", 120 | "\n", 121 | " # Parse everything else\n", 122 | " new_row = dict(zip(features_lower_case, h.values()))\n", 123 | " new_row['hops'] = hops\n", 124 | "\n", 125 | "\n", 126 | " for key,value in new_row.items():\n", 127 | " if 
key in columns:\n", 128 | " row_dict['label'] = label\n", 129 | " row_dict[key] = value\n", 130 | "\n", 131 | "\n", 132 | " list_of_rows.append(row_dict)\n", 133 | "\n", 134 | "def main():\n", 135 | " global list_of_rows\n", 136 | "\n", 137 | " file_path = 'Phishing Dataset/phishing-2017.txt'\n", 138 | " \n", 139 | " # Read the full email content\n", 140 | " emailStr = ''\n", 141 | " try:\n", 142 | " with open(file_path, encoding='latin_1') as emailFile:\n", 143 | " for line in emailFile:\n", 144 | " emailStr += line\n", 145 | " except UnicodeDecodeError:\n", 146 | " print('Unicode Error!')\n", 147 | " \n", 148 | " email_list = emailStr.split('\\nFrom jose@monkey.org')\n", 149 | " #print(email_list[1])\n", 150 | " \n", 151 | " addEmailsToDict(email_list)\n", 152 | " \n", 153 | " # Create the dataframe\n", 154 | " df = pd.DataFrame(list_of_rows)\n", 155 | " \n", 156 | " final_columns = []\n", 157 | " for col in columns:\n", 158 | " if col in df.columns:\n", 159 | " final_columns.append(col)\n", 160 | " \n", 161 | " df = df[final_columns]\n", 162 | " \n", 163 | " # Output the dataframe to a .csv file\n", 164 | " df.to_csv('phishing_out_2017V2.csv', index=False)\n", 165 | "\n", 166 | "if __name__ == '__main__':\n", 167 | " main()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "**Show the most common header fields:**" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 4, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "('date', 1012)\n", 187 | "('from', 1012)\n", 188 | "('subject', 1012)\n", 189 | "('status', 1010)\n", 190 | "('return-path', 1009)\n", 191 | "('delivered-to', 1009)\n", 192 | "('x-fda', 1009)\n", 193 | "('x-spam-summary', 1009)\n", 194 | "('x-he-tag', 1009)\n", 195 | "('received', 1009)\n", 196 | "('content-type', 1008)\n", 197 | "('x-status', 1007)\n", 198 | "('x-keywords', 1007)\n", 199 | 
"('x-uid', 1007)\n", 200 | "('x-filterd-recvd-size', 1000)\n", 201 | "('mime-version', 1000)\n", 202 | "('to', 968)\n", 203 | "('message-id', 837)\n", 204 | "('content-transfer-encoding', 320)\n", 205 | "('dkim-signature', 259)\n", 206 | "('reply-to', 143)\n", 207 | "('x-authenticated-sender', 137)\n", 208 | "('x-antiabuse', 133)\n", 209 | "('x-get-message-sender-via', 132)\n", 210 | "('x-originating-ip', 120)\n", 211 | "('x-priority', 108)\n", 212 | "('thread-index', 105)\n", 213 | "('x-source', 105)\n", 214 | "('x-source-args', 105)\n", 215 | "('x-source-dir', 105)\n", 216 | "('thread-topic', 101)\n", 217 | "('content-language', 93)\n", 218 | "('x-ms-has-attach', 93)\n", 219 | "('x-ms-tnef-correlator', 93)\n", 220 | "('accept-language', 92)\n", 221 | "('x-mailer', 86)\n", 222 | "('x-virus-scanned', 83)\n", 223 | "('x-sg-eid', 71)\n", 224 | "('x-php-originating-script', 64)\n", 225 | "('authentication-results', 61)\n", 226 | "('received-spf', 45)\n", 227 | "('references', 42)\n", 228 | "('x-msmail-priority', 40)\n", 229 | "('in-reply-to', 39)\n", 230 | "('x-spam-status', 34)\n", 231 | "('x-originalarrivaltime', 33)\n", 232 | "('importance', 33)\n", 233 | "('x-php-script', 32)\n", 234 | "('x-microsoft-antispam', 29)\n", 235 | "('x-originatororg', 29)\n", 236 | "('x-ms-exchange-crosstenant-originalarrivaltime', 29)\n", 237 | "('x-ms-exchange-crosstenant-fromentityheader', 29)\n", 238 | "('x-ms-exchange-transport-crosstenantheadersstamped', 29)\n", 239 | "('x-forefront-antispam-report', 28)\n", 240 | "('x-ms-office365-filtering-correlation-id', 28)\n", 241 | "('x-microsoft-antispam-prvs', 28)\n", 242 | "('x-microsoft-exchange-diagnostics', 26)\n", 243 | "('x-ms-publictraffictype', 26)\n", 244 | "('priority', 26)\n", 245 | "('x-exchange-antispam-report-cfa-test', 25)\n", 246 | "('x-forefront-prvs', 25)\n", 247 | "('x-ms-exchange-crosstenant-id', 25)\n", 248 | "('spamdiagnosticoutput', 24)\n", 249 | "('x-mimeole', 23)\n", 250 | "('x-ms-traffictypediagnostic', 23)\n", 
251 | "('x-exchange-antispam-report-test', 22)\n", 252 | "('x-google-dkim-signature', 20)\n", 253 | "('x-gm-message-state', 20)\n", 254 | "('x-received', 20)\n", 255 | "('domainkey-signature', 20)\n", 256 | "('list-unsubscribe', 19)\n", 257 | "('sender', 18)\n", 258 | "('x-antivirus', 16)\n", 259 | "('x-microsoft-antispam-message-info', 16)\n", 260 | "('x-ms-exchange-crosstenant-network-message-id', 16)\n", 261 | "('x-ironport-av', 14)\n", 262 | "('x-antivirus-status', 14)\n", 263 | "('x-clientproxiedby', 14)\n", 264 | "('x-cm-score', 13)\n", 265 | "('dkim-filter', 13)\n", 266 | "('x-sender', 13)\n", 267 | "('user-agent', 13)\n", 268 | "('x-ms-exchange-transport-fromentityheader', 12)\n", 269 | "('x-return-path', 11)\n", 270 | "('x-spam-level', 11)\n", 271 | "('x-outgoing-spam-status', 11)\n", 272 | "('x-google-smtp-source', 11)\n", 273 | "('x-rspamd-queue-id', 11)\n", 274 | "('x-rspamd-server', 11)\n", 275 | "('x-relaying-domain', 10)\n", 276 | "('x-eopattributedmessage', 10)\n", 277 | "('x-authority-analysis', 10)\n", 278 | "('x-ms-exchange-senderadcheck', 10)\n", 279 | "('x-report-abuse-to', 9)\n", 280 | "('x-ms-exchange-crosstenant-originalattributedtenantconnectingip', 9)\n", 281 | "('x-brightmail-tracker', 9)\n", 282 | "('x-stat-signature', 9)\n", 283 | "('x-spam-processed', 8)\n", 284 | "('x-mdremoteip', 8)\n", 285 | "('x-envelope-from', 8)\n", 286 | "('x-mdaemon-deliver-to', 8)\n", 287 | "('x-spam-flag', 8)\n", 288 | "('x-spam-checker-version', 8)\n", 289 | "('x-recommended-action', 8)\n", 290 | "('x-filter-id', 8)\n", 291 | "('x-ct-class', 8)\n", 292 | "('x-ct-score', 8)\n", 293 | "('x-ct-refid', 8)\n", 294 | "('x-ct-spam', 8)\n", 295 | "('x-tm-as-result', 8)\n", 296 | "('bcc', 8)\n", 297 | "('x-ironport-anti-spam-filtered', 7)\n", 298 | "('x-ironport-anti-spam-result', 7)\n", 299 | "('x-spam-score', 7)\n", 300 | "('x-proofpoint-virus-version', 7)\n", 301 | "('x-tm-as-product-ver', 7)\n", 302 | "('x-ipas-result', 6)\n", 303 | "('sensitivity', 6)\n", 304 | 
"('x-virus-status', 6)\n", 305 | "('x-scanned-by', 6)\n", 306 | "('x-no-relay', 6)\n", 307 | "('x-spamexperts-domain', 6)\n", 308 | "('x-spamexperts-username', 6)\n", 309 | "('x-spamexperts-outgoing-class', 6)\n", 310 | "('x-spamexperts-outgoing-evidence', 6)\n", 311 | "('x-auditid', 6)\n", 312 | "('x-tmase-matchedrid', 6)\n", 313 | "('x-entity-id', 6)\n", 314 | "('acceptlanguage', 5)\n", 315 | "('x-proofpoint-spam-reason', 5)\n", 316 | "('x-id', 5)\n", 317 | "('x-toi-msgid', 5)\n", 318 | "('x-imss-scan-details', 5)\n", 319 | "('x-mailgun-sending-ip', 5)\n", 320 | "('x-mailgun-sid', 5)\n", 321 | "('x-tmase-result', 5)\n", 322 | "('x-tmase-version', 5)\n", 323 | "('x-easynet-es-outgoing-spam-score', 5)\n", 324 | "('x-outgoing-spam-report', 5)\n", 325 | "('x-outgoing-spamscan-signature', 5)\n", 326 | "('x-easynet-bounce-key', 5)\n", 327 | "('content-class', 4)\n", 328 | "('x_cmae_category', 4)\n", 329 | "('x-cnfs-analysis', 4)\n", 330 | "('x-authed-username', 4)\n", 331 | "('x-mailer-lid', 4)\n", 332 | "('x-mailer-recptid', 4)\n", 333 | "('x-mailer-sid', 4)\n", 334 | "('x-mailer-sent-by', 4)\n", 335 | "('spamdiagnosticmetadata', 4)\n", 336 | "('x-php-filename', 4)\n", 337 | "('x-tm-as-user-approved-sender', 4)\n", 338 | "('x-tm-as-user-blocked-sender', 4)\n", 339 | "('x-env-sender', 4)\n", 340 | "('x-msg-ref', 4)\n", 341 | "('x-starscan-received', 4)\n", 342 | "('x-starscan-version', 4)\n", 343 | "('x-kse-serverinfo', 4)\n", 344 | "('x-kse-antivirus-interceptor-info', 4)\n", 345 | "('x-kse-antivirus-info', 4)\n", 346 | "('precedence', 4)\n", 347 | "('x-mailscanner-from', 4)\n", 348 | "('x-sasi-rcode', 4)\n", 349 | "('x-asg-debug-id', 4)\n", 350 | "('x-barracuda-envelope-from', 4)\n", 351 | "('x-asg-orig-subj', 4)\n", 352 | "('x-barracuda-connect', 4)\n", 353 | "('x-barracuda-start-time', 4)\n", 354 | "('x-barracuda-url', 4)\n", 355 | "('x-barracuda-scan-msg-size', 4)\n", 356 | "('x-barracuda-brts-status', 4)\n", 357 | "('x-barracuda-spam-score', 4)\n", 358 | 
"('x-barracuda-spam-status', 4)\n", 359 | "('x-barracuda-spam-report', 4)\n", 360 | "('x-sg-id', 4)\n", 361 | "('x-spam-scanned', 4)\n", 362 | "('x-locaweb-id', 4)\n", 363 | "('x-he-dkim-result', 4)\n", 364 | "('x-imap', 3)\n", 365 | "('x-cyberoam-smtpxy-version', 3)\n", 366 | "('x-cyberoam-av-policy', 3)\n", 367 | "('x-yahoo-newman-property', 3)\n", 368 | "('x-yahoo-newman-id', 3)\n", 369 | "('x-ymail-osg', 3)\n", 370 | "('x-msw-jemd-refid', 3)\n", 371 | "('disposition-notification-to', 3)\n", 372 | "('x-viruschecked', 3)\n", 373 | "('x-auth-id', 3)\n", 374 | "('x-footer', 3)\n", 375 | "('list-id', 3)\n", 376 | "('x-csa-complaints', 3)\n", 377 | "('x-authuser', 3)\n", 378 | "('x-mdav-processed', 3)\n", 379 | "('x-mailscanner-id', 3)\n", 380 | "('x-symc-ess-client-auth', 3)\n", 381 | "('feedback-id', 3)\n", 382 | "('recipient-id', 3)\n", 383 | "('x-debug', 3)\n", 384 | "('x-email-rejection-mode', 3)\n", 385 | "('x-api-host', 3)\n", 386 | "('site-id', 3)\n", 387 | "('x-bounceemailversion', 3)\n", 388 | "('x-pmx-version', 3)\n", 389 | "('x-crosspremisesheadersfilteredbysendconnector', 3)\n", 390 | "('x-organizationheaderspreserved', 3)\n", 391 | "('x-dkim', 3)\n", 392 | "('x-tm-as-gconf', 3)\n", 393 | "('x-tmase-snap-result', 3)\n", 394 | "('x-barracuda-effective-source-ip', 3)\n", 395 | "('x-barracuda-apparent-source-ip', 3)\n", 396 | "('mail-reply-to', 3)\n", 397 | "('x-locaweb-id2', 3)\n", 398 | "('x-message-uid', 3)\n", 399 | "('x-account-uid', 3)\n", 400 | "('x-envid', 3)\n", 401 | "('arc-seal', 3)\n", 402 | "('arc-message-signature', 3)\n", 403 | "('arc-authentication-results', 3)\n", 404 | "('x-ms-oob-tlc-oobclassifiers', 3)\n", 405 | "('x-ms-exchange-antispam-messagedata', 3)\n", 406 | "('x-ms-exchange-transport-forked', 3)\n", 407 | "('x-ms-exchange-crosstenant-authas', 3)\n", 408 | "('x-ms-exchange-crosstenant-authsource', 3)\n", 409 | "('x-ms-exchange-crosstenant-mailboxtype', 3)\n", 410 | "('x-ms-exchange-crosstenant-userprincipalname', 3)\n", 411 | 
"('content-length', 2)\n", 412 | "('ironport-phdr', 2)\n", 413 | "('x-vpm-host', 2)\n", 414 | "('x-vpm-group-id', 2)\n", 415 | "('x-vpm-msg-id', 2)\n", 416 | "('x-vpm-enc-regime', 2)\n", 417 | "('x-vpm-is-hybrid', 2)\n", 418 | "('x-exclaimer-md-config', 2)\n", 419 | "('x-mb-message-source', 2)\n", 420 | "('x-mb-message-type', 2)\n", 421 | "('x-aol-sid', 2)\n", 422 | "('x-netcomp-mailscanner-information', 2)\n", 423 | "('x-netcomp-mailscanner-id', 2)\n", 424 | "('x-netcomp-mailscanner', 2)\n", 425 | "('x-netcomp-mailscanner-spamscore', 2)\n", 426 | "('x-netcomp-mailscanner-from', 2)\n", 427 | "('content-disposition', 2)\n", 428 | "('x-canit-geo', 2)\n", 429 | "('x-canitpro-stream', 2)\n", 430 | "('x-canit-stats-id', 2)\n", 431 | "('return-receipt-to', 2)\n", 432 | "('x-sa-exim-connect-ip', 2)\n", 433 | "('x-sa-exim-mail-from', 2)\n", 434 | "('x-sa-exim-version', 2)\n", 435 | "('x-sa-exim-scanned', 2)\n", 436 | "('x-ms-office365-filtering-ht', 2)\n", 437 | "('x-mail', 2)\n", 438 | "('x-spam', 2)\n", 439 | "('x-kse-attachment-filter-scan-result', 2)\n", 440 | "('x-mdhelo', 2)\n", 441 | "('x-mdarrival-date', 2)\n", 442 | "('x-cav-result', 2)\n", 443 | "('x-wb-res', 2)\n", 444 | "('x-msw-jemd-newsletter', 2)\n", 445 | "('x-claimtheweb-mailscanner-information', 2)\n", 446 | "('x-claimtheweb-mailscanner-id', 2)\n", 447 | "('x-claimtheweb-mailscanner', 2)\n", 448 | "('x-claimtheweb-mailscanner-spamcheck', 2)\n", 449 | "('x-claimtheweb-mailscanner-from', 2)\n", 450 | "('x-mimectl', 2)\n", 451 | "('x-proofpoint-virus-status', 2)\n", 452 | "('x-me-helo', 2)\n", 453 | "('x-me-date', 2)\n", 454 | "('x-me-ip', 2)\n", 455 | "('x-ctch-refid', 2)\n", 456 | "('x-ctch-vod', 2)\n", 457 | "('x-ctch-spam', 2)\n", 458 | "('x-virus-checked', 2)\n", 459 | "('x-ms-exchange-messagesentrepresentingtype', 2)\n", 460 | "('x-mc-unique', 2)\n", 461 | "('x-mimecast-spam-score', 2)\n", 462 | "('x-intloopheader', 2)\n", 463 | "('x-smtp33-mailscanner-information', 2)\n", 464 | 
"('x-smtp33-mailscanner', 2)\n", 465 | "('x-mlf-version', 2)\n", 466 | "('x-mlf-license', 2)\n", 467 | "('x-mlf-uniqueid', 2)\n", 468 | "('x-bounce-tracking-info', 2)\n", 469 | "('x-authority-reason', 2)\n", 470 | "('x-bwhitelist', 2)\n", 471 | "('x-source-ip', 2)\n", 472 | "('x-source-l', 2)\n", 473 | "('x-exim-id', 2)\n", 474 | "('x-source-sender', 2)\n", 475 | "('x-source-auth', 2)\n", 476 | "('x-email-count', 2)\n", 477 | "('x-source-cap', 2)\n", 478 | "('x-local-domain', 2)\n", 479 | "('ironport-sdr', 2)\n", 480 | "('x-ses-outgoing', 2)\n", 481 | "('x-barracuda-rbl-trusted-forwarder', 2)\n", 482 | "('x-barracuda-rbl-ip', 2)\n", 483 | "('x-barracuda-encrypted', 2)\n", 484 | "('x-kse-attachment-filter-triggered-rules', 2)\n", 485 | "('x-kse-attachment-filter-triggered-filters', 2)\n", 486 | "('x-kse-bulkmessagesfiltering-scan-result', 2)\n", 487 | "('x-classification-id', 2)\n", 488 | "('x-spamd-result', 2)\n", 489 | "('x-mimetrack', 1)\n", 490 | "('x-pmwin-version', 1)\n", 491 | "('x-msw-jemd-spam', 1)\n", 492 | "('x-rc-from', 1)\n", 493 | "('x-dlp-outbound', 1)\n", 494 | "('x-mag-outbound', 1)\n", 495 | "('x-modus-blacklist', 1)\n", 496 | "('x-modus-trusted', 1)\n", 497 | "('x-modus-spam-version', 1)\n", 498 | "('x-modus-audit', 1)\n", 499 | "('x-spam-ctch-refid', 1)\n", 500 | "('x-aol-global-disposition', 1)\n", 501 | "('x-aol-reroute', 1)\n", 502 | "('x-suspect-keyword', 1)\n", 503 | "('x-hermes-message-id', 1)\n", 504 | "('x-hm-ut', 1)\n", 505 | "('x-bayes-prob', 1)\n", 506 | "('x-microsoft-exchange-diagnostics-untrusted', 1)\n", 507 | "('x-microsoft-antispam-untrusted', 1)\n", 508 | "('x-forefront-antispam-report-untrusted', 1)\n", 509 | "('x-ms-exchange-transport-crosstenantheadersstripped', 1)\n", 510 | "('x-dkimresult-test', 1)\n", 511 | "('x-nai-spam-flag', 1)\n", 512 | "('x-nai-spam-level', 1)\n", 513 | "('x-nai-spam-threshold', 1)\n", 514 | "('x-nai-spam-score', 1)\n", 515 | "('x-nai-spam-version', 1)\n", 516 | "('x-disclaimer', 1)\n", 517 | 
"('x-imss-dkim-white-list', 1)\n", 518 | "('x-ait-server-origin', 1)\n", 519 | "('x-ait-path-origin', 1)\n", 520 | "('x-ait-rev', 1)\n", 521 | "('x-marr-info', 1)\n", 522 | "('x-kerio-anti-spam', 1)\n", 523 | "('x-antivirus-mydomain-mail-from', 1)\n", 524 | "('x-antivirus-mydomain', 1)\n", 525 | "('x-antivirus-mydomain-message-id', 1)\n", 526 | "('x-job', 1)\n", 527 | "('x-proofpoint-spam-details', 1)\n", 528 | "('x-hospedandoantispam-domain', 1)\n", 529 | "('x-hospedandoantispam-username', 1)\n", 530 | "('x-hospedandoantispam-outgoing-class', 1)\n", 531 | "('x-hospedandoantispam-outgoing-evidence', 1)\n", 532 | "('x-dnsrbl', 1)\n", 533 | "('organization', 1)\n", 534 | "('x-xm-spf', 1)\n", 535 | "('x-spam-report', 1)\n", 536 | "('x-spam-dcc', 1)\n", 537 | "('x-spam-combo', 1)\n", 538 | "('x-spam-relay-country', 1)\n", 539 | "('x-spam-timing', 1)\n", 540 | "('x-disclaimer-applied', 1)\n", 541 | "('x-7fa49cb5', 1)\n", 542 | "('x-mailru-sender', 1)\n", 543 | "('x-mras', 1)\n", 544 | "('x-kse-antispam-interceptor-info', 1)\n", 545 | "('x-kse-dlp-scaninfo', 1)\n", 546 | "('x-proofpoint-virus-details', 1)\n", 547 | "('content-description', 1)\n", 548 | "('x-mailscanner', 1)\n", 549 | "('x-mailscanner-spamscore', 1)\n", 550 | "('x-sussex', 1)\n", 551 | "('x-sussex-transport', 1)\n", 552 | "('x-rule-processed-ssl01', 1)\n", 553 | "('x-msfbl', 1)\n", 554 | "('x-wb-msg-id', 1)\n", 555 | "('x-assp-version', 1)\n", 556 | "('x-assp-id', 1)\n", 557 | "('x-assp-session', 1)\n", 558 | "('x-assp-envelope-from', 1)\n", 559 | "('x-assp-intended-for', 1)\n", 560 | "('x-assp-original-subject', 1)\n", 561 | "('x-assp-client-tls', 1)\n", 562 | "('x-report-abuse', 1)\n", 563 | "('x-mandrill-user', 1)\n", 564 | "('x-msw-jemd-malware', 1)\n", 565 | "('x-me-auth', 1)\n", 566 | "('x-country-code', 1)\n", 567 | "('x-cache-id', 1)\n", 568 | "('message-context', 1)\n", 569 | "('x-wum-signatureadded', 1)\n", 570 | "('x-message-size', 1)\n", 571 | "('x-savecopy', 1)\n", 572 | "('x-national-code', 
1)\n", 573 | "('x-cache-entry', 1)\n", 574 | "('x-wum-channeltype', 1)\n", 575 | "('x-wum-nature', 1)\n", 576 | "('x-wum-from', 1)\n", 577 | "('x-wum-to', 1)\n", 578 | "('x-wum-replyto', 1)\n", 579 | "('x-ums', 1)\n", 580 | "('x-csc', 1)\n", 581 | "('x-cha', 1)\n", 582 | "('x-ctch-score', 1)\n", 583 | "('x-ctch-rules', 1)\n", 584 | "('x-ctch-flags', 1)\n", 585 | "('x-ctch-scorecust', 1)\n", 586 | "('x-masterwebnetwork-domain', 1)\n", 587 | "('x-masterwebnetwork-username', 1)\n", 588 | "('x-masterwebnetwork-outgoing-class', 1)\n", 589 | "('x-masterwebnetwork-outgoing-evidence', 1)\n", 590 | "('x-smtp10-mailscanner-information', 1)\n", 591 | "('x-smtp10-mailscanner-id', 1)\n", 592 | "('x-smtp10-mailscanner', 1)\n", 593 | "('x-cloudmilter-processed', 1)\n", 594 | "('x-originating-smarthost01a-ip', 1)\n", 595 | "('x-mdav-result', 1)\n", 596 | "('x-ctch-error', 1)\n", 597 | "('x-authenticated', 1)\n", 598 | "('x-ratelimit', 1)\n", 599 | "('x-outbound-ip', 1)\n", 600 | "('x-env-from', 1)\n", 601 | "('x-proto', 1)\n", 602 | "('x-revdns', 1)\n", 603 | "('x-helo', 1)\n", 604 | "('x-tls', 1)\n", 605 | "('x-authenticated_id', 1)\n", 606 | "('x-policysmart', 1)\n", 607 | "('x-antispam-training-forget', 1)\n", 608 | "('x-antispam-training-nonspam', 1)\n", 609 | "('x-antispam-training-phish', 1)\n", 610 | "('x-antispam-training-spam', 1)\n", 611 | "('x-incomingtopheadermarker', 1)\n", 612 | "('x-tmn', 1)\n", 613 | "('x-incomingheadercount', 1)\n", 614 | "('x-ms-exchange-slblob-mailprops', 1)\n", 615 | "('x-ms-exchange-crosstenant-rms-persistedconsumerorg', 1)\n", 616 | "('x-kse-antispam-outbound-interceptor-info', 1)\n", 617 | "('x-kse-antispam-version', 1)\n", 618 | "('x-kse-antispam-status', 1)\n", 619 | "('x-kse-antispam-method', 1)\n", 620 | "('x-kse-antispam-rate', 1)\n", 621 | "('x-kse-antispam-info', 1)\n", 622 | "('x-kse-antiphishing-info', 1)\n", 623 | "('x-kse-antiphishing-method', 1)\n", 624 | "('x-kse-antiphishing-bases', 1)\n", 625 | "('x-mailster', 1)\n", 626 | 
"('x-mailster-campaign', 1)\n", 627 | "('x-mailster-id', 1)\n", 628 | "('list-unsubscribe-post', 1)\n", 629 | "('x-message-id', 1)\n", 630 | "('x-ort-llc-mailscanner-information', 1)\n", 631 | "('x-ort-llc-mailscanner-id', 1)\n", 632 | "('x-ort-llc-mailscanner', 1)\n", 633 | "('x-ort-llc-mailscanner-from', 1)\n", 634 | "('x-klms-rule-id', 1)\n", 635 | "('x-klms-message-action', 1)\n", 636 | "('x-klms-antispam-lua-profiles', 1)\n", 637 | "('x-klms-antispam-version', 1)\n", 638 | "('x-klms-antispam-envelope-from', 1)\n", 639 | "('x-klms-antispam-rate', 1)\n", 640 | "('x-klms-antispam-status', 1)\n", 641 | "('x-klms-antispam-method', 1)\n", 642 | "('x-klms-antispam-info', 1)\n", 643 | "('x-klms-antispam-interceptor-info', 1)\n", 644 | "('x-klms-antiphishing', 1)\n", 645 | "('x-klms-antivirus', 1)\n", 646 | "('x-klms-antivirus-status', 1)\n", 647 | "('x-additional-header', 1)\n", 648 | "('x-virus-found', 1)\n", 649 | "('x-lookup-warning', 1)\n", 650 | "('x-ovh-tracer-id', 1)\n", 651 | "('x-vr-spamstate', 1)\n", 652 | "('x-vr-spamscore', 1)\n", 653 | "('x-vr-spamcause', 1)\n", 654 | "('x-tm-as-smtp', 1)\n", 655 | "('x-tm-as-ers', 1)\n", 656 | "('x-terrace-dummysubject', 1)\n", 657 | "('x-terrace-spammark', 1)\n", 658 | "('x-terrace-classid', 1)\n", 659 | "('x-c2processedorg', 1)\n", 660 | "('x-spamh-filter', 1)\n", 661 | "('x-spamh-originatingip', 1)\n", 662 | "('x-authentication-warning', 1)\n", 663 | "('x-mj-mid', 1)\n", 664 | "('x-mj-smtpguid', 1)\n", 665 | "('x-hmdnsgroup-mailscanner-information', 1)\n", 666 | "('x-hmdnsgroup-mailscanner-id', 1)\n", 667 | "('x-hmdnsgroup-mailscanner', 1)\n", 668 | "('x-hmdnsgroup-mailscanner-spamcheck', 1)\n", 669 | "('x-hmdnsgroup-mailscanner-from', 1)\n", 670 | "('x-forwarded-for', 1)\n", 671 | "('x-auth', 1)\n", 672 | "('x-cmae-envelope', 1)\n", 673 | "('x-tamu-auth-id', 1)\n", 674 | "('x-tamu-senderip', 1)\n", 675 | "('x-tamu-auth', 1)\n", 676 | "('x-clx-shades', 1)\n", 677 | "('x-clx-response', 1)\n", 678 | 
"('x-ironport-remoteip', 1)\n", 679 | "('x-ironport-mid', 1)\n", 680 | "('x-ironport-reputation', 1)\n", 681 | "('x-ironport-listener', 1)\n", 682 | "('x-ironport-sendergroup', 1)\n", 683 | "('x-ironport-mailflowpolicy', 1)\n", 684 | "('x-cloudmark-score', 1)\n", 685 | "('x-unknown-null', 1)\n", 686 | "('x-synaq-pinpoint-information', 1)\n", 687 | "('x-synaq-pinpoint-id', 1)\n", 688 | "('x-synaq-pinpoint', 1)\n", 689 | "('x-synaq-pinpoint-spamcheck', 1)\n", 690 | "('x-synaq-pinpoint-spamscore', 1)\n", 691 | "('x-pinpoint-from', 1)\n", 692 | "('x-acl-warn', 1)\n", 693 | "('x-bypsheader', 1)\n", 694 | "('x-smscore', 1)\n", 695 | "('x-lcid', 1)\n", 696 | "('x-sm_envelopefrom', 1)\n", 697 | "('x-sm_received_on', 1)\n", 698 | "('x-cpanel-mailscanner-information', 1)\n", 699 | "('x-cpanel-mailscanner-id', 1)\n", 700 | "('x-cpanel-mailscanner', 1)\n", 701 | "('x-cpanel-mailscanner-spamcheck', 1)\n", 702 | "('x-cpanel-mailscanner-spamscore', 1)\n", 703 | "('x-cpanel-mailscanner-from', 1)\n", 704 | "('x-sender-id', 1)\n", 705 | "('x-type', 1)\n", 706 | "('x-toi-expurgateid', 1)\n", 707 | "('x-amp-result', 1)\n", 708 | "('x-amp-file-uploaded', 1)\n", 709 | "('x-ctch-av-threatscount', 1)\n", 710 | "('x-en-sp-dir', 1)\n", 711 | "('x-en-sp-sq', 1)\n", 712 | "('disposition-notification-options', 1)\n", 713 | "('x-msg-eid', 1)\n", 714 | "('cc', 1)\n", 715 | "('x-dropbox-message-id', 1)\n", 716 | "('x-mn-spam-dtl', 1)\n", 717 | "('x-biz-relay', 1)\n", 718 | "('x-default-received-spf', 1)\n", 719 | "TOTAL EMAIL COUNT: 1012\n" 720 | ] 721 | }, 722 | { 723 | "data": { 724 | "image/png": "\n", 725 | "text/plain": [ 726 | "
" 727 | ] 728 | }, 729 | "metadata": { 730 | "needs_background": "light" 731 | }, 732 | "output_type": "display_data" 733 | } 734 | ], 735 | "source": [ 736 | "parser = HeaderParser()\n", 737 | "dict_header_features = {}\n", 738 | "total_emails = 0\n", 739 | "\n", 740 | "\n", 741 | "def addEmailsToDict(email_list):\n", 742 | " global parser, dict_header_features, total_emails\n", 743 | " \n", 744 | " # Read the full email content\n", 745 | " for email in email_list:\n", 746 | " \n", 747 | " total_emails += 1\n", 748 | " h = parser.parsestr(email)\n", 749 | " features_lower_case = [x.lower() for x in h.keys()]\n", 750 | " features_dict = Counter(features_lower_case)\n", 751 | "\n", 752 | " for k,v in features_dict.items():\n", 753 | " if k in dict_header_features:\n", 754 | " dict_header_features[k] += 1\n", 755 | " else:\n", 756 | " dict_header_features[k] = 1\n", 757 | "\n", 758 | " \n", 759 | "def main():\n", 760 | " global dict_header_features, total_emails\n", 761 | " \n", 762 | " file_path_list = ['Phishing Dataset/phishing-2017.txt', 'Phishing Dataset/phishing-2018.txt',\n", 763 | " 'Phishing Dataset/phishing-2019.txt', 'Phishing Dataset/phishing-2020.txt']\n", 764 | " \n", 765 | " for file_path in file_path_list:\n", 766 | " # Read the full email content\n", 767 | " emailStr = ''\n", 768 | " try:\n", 769 | " with open(file_path, encoding='latin_1') as emailFile:\n", 770 | " for line in emailFile:\n", 771 | " emailStr += line\n", 772 | " except UnicodeDecodeError:\n", 773 | " print('Unicode Error!')\n", 774 | "\n", 775 | " email_list = emailStr.split('\\nFrom jose@monkey.org')\n", 776 | " #print(email_list[1])\n", 777 | "\n", 778 | " addEmailsToDict(email_list)\n", 779 | " \n", 780 | " dict_header_features = sorted(dict_header_features.items(), key=lambda x:x[1], reverse=True)\n", 781 | "\n", 782 | " for v in dict_header_features:\n", 783 | " print(v)\n", 784 | " \n", 785 | " unzipped = list(zip(*dict_header_features))\n", 786 | " lab = unzipped[0]\n", 787 | 
" val = unzipped[1]\n", 788 | " print(\"TOTAL EMAIL COUNT:\", total_emails)\n", 789 | " val = [x/total_emails for x in val]\n", 790 | " figure(figsize=(15, 6))\n", 791 | " axes = plt.gca()\n", 792 | " axes.set_ylim([0,1])\n", 793 | " plt.xticks(rotation='vertical')\n", 794 | " plt.ylabel('Percent of total emails')\n", 795 | " plt.bar(lab[0:20], val[0:20], width=0.25)\n", 796 | " plt.grid()\n", 797 | " plt.savefig('most_common_header_features_phish.png', bbox_inches=\"tight\")\n", 798 | " plt.show()\n", 799 | "\n", 800 | "if __name__ == '__main__':\n", 801 | " main()" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [] 810 | } 811 | ], 812 | "metadata": { 813 | "kernelspec": { 814 | "display_name": "Python 3", 815 | "language": "python", 816 | "name": "python3" 817 | }, 818 | "language_info": { 819 | "codemirror_mode": { 820 | "name": "ipython", 821 | "version": 3 822 | }, 823 | "file_extension": ".py", 824 | "mimetype": "text/x-python", 825 | "name": "python", 826 | "nbconvert_exporter": "python", 827 | "pygments_lexer": "ipython3", 828 | "version": "3.8.5" 829 | } 830 | }, 831 | "nbformat": 4, 832 | "nbformat_minor": 4 833 | } 834 | -------------------------------------------------------------------------------- /Capstone Project - Testing Unsupervised, Phishing-Ham.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "RkQzXPCXUxme" 7 | }, 8 | "source": [ 9 | "# **Setup**" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 45, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.ensemble import RandomForestClassifier\n", 19 | "from sklearn.svm import SVC\n", 20 | "from sklearn.naive_bayes import GaussianNB\n", 21 | "from sklearn.naive_bayes import MultinomialNB\n", 22 | "from sklearn.naive_bayes import 
BernoulliNB\n", 23 | "from sklearn.ensemble import AdaBoostClassifier\n", 24 | "from sklearn.ensemble import StackingClassifier\n", 25 | "from sklearn.neighbors import KNeighborsClassifier\n", 26 | "from sklearn.tree import DecisionTreeClassifier\n", 27 | "from sklearn.linear_model import LogisticRegression\n", 28 | "from sklearn.neural_network import MLPClassifier\n", 29 | "from sklearn.naive_bayes import MultinomialNB\n", 30 | "from sklearn.linear_model import LogisticRegression\n", 31 | "from sklearn.ensemble import GradientBoostingClassifier\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "from sklearn.svm import OneClassSVM\n", 34 | "\n", 35 | "from sklearn.metrics import accuracy_score\n", 36 | "from sklearn.metrics import roc_auc_score\n", 37 | "from sklearn.metrics import f1_score\n", 38 | "from sklearn.metrics import recall_score\n", 39 | "from sklearn.metrics import precision_score\n", 40 | "from sklearn.metrics import confusion_matrix\n", 41 | "from sklearn.model_selection import cross_val_score\n", 42 | "from sklearn import metrics\n", 43 | "from sklearn.inspection import permutation_importance\n", 44 | "from matplotlib import pyplot as plt\n", 45 | "\n", 46 | "from sklearn.metrics import plot_roc_curve\n", 47 | "from sklearn.decomposition import PCA\n", 48 | "from matplotlib.pyplot import figure\n", 49 | "from sklearn.model_selection import GridSearchCV\n", 50 | "from sklearn.pipeline import Pipeline\n", 51 | "from sklearn.model_selection import StratifiedShuffleSplit\n", 52 | "\n", 53 | "import pandas as pd\n", 54 | "import numpy as np" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 46, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df = pd.read_csv('preprocessed_spam_ham_phishing.csv')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "**Feature reduction:**" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | 
"The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 47, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "feature_list = [\n", 87 | "'domain_match_from_return-path',\n", 88 | "'domain_match_message-id_from',\n", 89 | "'domain_match_message-id_return-path',\n", 90 | "'domain_match_to_from',\n", 91 | "'domain_match_errors-to_from',\n", 92 | "'domain_match_message-id_reply-to',\n", 93 | "'domain_match_errors-to_message-id',\n", 94 | "'domain_match_sender_from',\n", 95 | "'domain_match_to_received',\n", 96 | "'domain_match_errors-to_reply-to',\n", 97 | "'domain_match_to_message-id',\n", 98 | "'label']\n", 99 | "\n", 100 | "df = df[feature_list]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "**Remove spam emails, only consider ham and phishing:**" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 48, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "(26508, 12)\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "df = df[df['label'] != 1]\n", 125 | "print(df.shape)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 49, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "0 25220\n", 137 | "2 1288\n", 138 | "Name: label, dtype: int64" 139 | ] 140 | }, 141 | "execution_count": 49, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "df['label'].value_counts()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 50, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "(25220, 12)\n", 160 | "(1288, 12)\n" 161 | ] 162 | } 163 | ], 
164 | "source": [ 165 | "df_ham = df[df['label'] == 0]\n", 166 | "df_phish = df[df['label'] == 2]\n", 167 | "print(df_ham.shape)\n", 168 | "print(df_phish.shape)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 51, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "df_phish = df_phish.assign(label=1)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 52, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "(1288, 12)\n", 190 | "(25220, 12)\n", 191 | "1 1288\n", 192 | "Name: label, dtype: int64\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "print(df_phish.shape)\n", 198 | "print(df_ham.shape)\n", 199 | "print(df_phish['label'].value_counts())" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 53, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "df_ham_Y = df_ham['label']\n", 209 | "df_ham_X = df_ham.drop('label', axis=1)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "**Take 1288 of the ham emails to be used for testing, the rest for training:**" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 54, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from sklearn.model_selection import train_test_split\n", 226 | "\n", 227 | "X_train, X_test_ham, y_train, y_test_ham = train_test_split(df_ham_X, df_ham_Y, test_size=1288, random_state=42)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 55, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "(23932, 11)\n", 240 | "(1288, 11)\n", 241 | "(23932,)\n", 242 | "(1288,)\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "print(X_train.shape)\n", 248 | "print(X_test_ham.shape)\n", 249 | "print(y_train.shape)\n", 250 | 
"print(y_test_ham.shape)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "**Create the test set, which is 1288 ham and 1288 phishing emails:**" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 56, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "df_phish_Y = df_phish['label']\n", 267 | "df_phish_X = df_phish.drop('label', axis=1)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 57, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "test_X = X_test_ham.append(df_phish_X, ignore_index=True)\n", 277 | "test_Y = y_test_ham.append(df_phish_Y, ignore_index=True)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 58, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "(2576, 11)\n", 290 | "(2576,)\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(test_X.shape)\n", 296 | "print(test_Y.shape)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "**OC-SVM predicts either 1 (inlier) or -1 (outlier), so the labels need to be adjusted: ham becomes 1 and phishing becomes -1:**" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 59, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "test_Y = pd.DataFrame(test_Y, columns=['label'])\n", 313 | "test_Y.loc[test_Y['label'] == 1, 'label'] = -1\n", 314 | "test_Y.loc[test_Y['label'] == 0, 'label'] = 1" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 60, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "label\n", 326 | " 1 1288\n", 327 | "-1 1288\n", 328 | "dtype: int64" 329 | ] 330 | }, 331 | "execution_count": 60, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "test_Y.value_counts()" 338 | ] 339 | }, 340 | { 341 | "cell_type":
"markdown", 342 | "metadata": { 343 | "id": "ZfqcyuLalhxp" 344 | }, 345 | "source": [ 346 | "**Apply a standard scaler to the test set and the training set (each set is fitted and transformed separately):**" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 61, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "features_list = test_X.columns" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 62, 361 | "metadata": { 362 | "id": "j6TAQHBjlhJ6" 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "from sklearn.preprocessing import StandardScaler\n", 367 | "\n", 368 | "scaler = StandardScaler()\n", 369 | "\n", 370 | "scaler.fit(test_X)\n", 371 | "test_X = scaler.transform(test_X)\n", 372 | "test_X = pd.DataFrame(test_X, columns=features_list)\n", 373 | "\n", 374 | "scaler.fit(X_train)\n", 375 | "X_train = scaler.transform(X_train)\n", 376 | "X_train = pd.DataFrame(X_train, columns=features_list)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "id": "iGZ561te2ONO" 383 | }, 384 | "source": [ 385 | "# **Testing:**" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "id": "6XR4Wc3Fqh_W" 392 | }, 393 | "source": [ 394 | "**OC-SVM:**" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 63, 400 | "metadata": { 401 | "id": "oZ239URUq-tx" 402 | }, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | "Accuracy: 87.22826086956522\n", 409 | "F1 Score: 86.3768115942029\n", 410 | "Recall: 80.97826086956522\n", 411 | "Precision: 92.54658385093167\n", 412 | "ROC AUC: 87.22826086956523\n", 413 | "Confusion Matrix: [[1204 84]\n", 414 | " [ 245 1043]]\n", 415 | "Wall time: 10.2 s\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "%%time\n", 421 | "\n", 422 | "ocsvm = OneClassSVM(kernel='poly', degree=6, nu=0.2)\n", 423 | "ocsvm.fit(X_train)\n", 424 | "\n", 425 | "# Test the model on the test set\n", 426 | "predictions =
ocsvm.predict(test_X)\n", 427 | "\n", 428 | "# Get the evaluation metrics\n", 429 | "print('Accuracy:', accuracy_score(test_Y, predictions)*100)\n", 430 | "print('F1 Score:', f1_score(test_Y, predictions)*100)\n", 431 | "print('Recall:', recall_score(test_Y, predictions)*100)\n", 432 | "print('Precision:', precision_score(test_Y, predictions)*100)\n", 433 | "print('ROC AUC:', roc_auc_score(test_Y, predictions)*100)\n", 434 | "print('Confusion Matrix:', confusion_matrix(test_Y, predictions))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [] 443 | } 444 | ], 445 | "metadata": { 446 | "colab": { 447 | "collapsed_sections": [ 448 | "sCMEqns8jY-1", 449 | "F5A997Ekjh0E", 450 | "KvWjb8GwOM-y", 451 | "rJJ4WHSBFTWc", 452 | "nqHSyA4SyJ7K", 453 | "xhkNe2ql-cFP", 454 | "fMwsPWxpi2cG", 455 | "-_nEjwFOjUDI", 456 | "MyekmTK3q5TC", 457 | "RkQzXPCXUxme", 458 | "P1zXPP6trxSN", 459 | "iGZ561te2ONO", 460 | "rffvHbuzGO8c" 461 | ], 462 | "name": "Capstone Project - Email Headers.ipynb", 463 | "provenance": [] 464 | }, 465 | "kernelspec": { 466 | "display_name": "Python 3", 467 | "language": "python", 468 | "name": "python3" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 3 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython3", 480 | "version": "3.8.5" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 1 485 | } 486 | -------------------------------------------------------------------------------- /Capstone project - Extract HamSpam.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import shutil\n", 11 | "from 
os import listdir, mkdir, path\n", 12 | "from pathlib import Path\n", 13 | "import sys\n", 14 | "from email.parser import HeaderParser\n", 15 | "import collections\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "from matplotlib.pyplot import figure\n", 18 | "import time\n", 19 | "from collections import Counter" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "1000\n", 32 | "13.240842342376709 seconds\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "parser = HeaderParser()\n", 38 | "columns = ['received1',\n", 39 | "'received2',\n", 40 | "'received3',\n", 41 | "'received4',\n", 42 | "'received5',\n", 43 | "'received6',\n", 44 | "'received7',\n", 45 | "'received8',\n", 46 | "'received9',\n", 47 | "'received10',\n", 48 | "'received11',\n", 49 | "'received12',\n", 50 | "'received13',\n", 51 | "'received14',\n", 52 | "'received15',\n", 53 | "'received16',\n", 54 | "'hops',\n", 55 | "'subject',\n", 56 | "'date',\n", 57 | "'message-id',\n", 58 | "'from',\n", 59 | "'return-path',\n", 60 | "'to',\n", 61 | "'content-type',\n", 62 | "'mime-version',\n", 63 | "'x-mailer',\n", 64 | "'content-transfer-encoding',\n", 65 | "'x-mimeole',\n", 66 | "'x-priority',\n", 67 | "'list-id',\n", 68 | "'lines',\n", 69 | "'x-virus-scanned',\n", 70 | "'status',\n", 71 | "'content-length',\n", 72 | "'precedence',\n", 73 | "'delivered-to',\n", 74 | "'list-unsubscribe',\n", 75 | "'list-subscribe',\n", 76 | "'list-post',\n", 77 | "'list-help',\n", 78 | "'x-msmail-priority',\n", 79 | "'x-spam-status',\n", 80 | "'sender',\n", 81 | "'errors-to',\n", 82 | "'x-beenthere',\n", 83 | "'list-archive',\n", 84 | "'reply-to',\n", 85 | "'x-mailman-version',\n", 86 | "'x-miltered',\n", 87 | "'x-uuid',\n", 88 | "'x-virus-status',\n", 89 | "'x-spam-level',\n", 90 | "'x-spam-checker-version',\n", 91 | "'references',\n", 92 | "'in-reply-to',\n", 93 | 
"'user-agent',\n", 94 | "'thread-index',\n", 95 | "'cc',\n", 96 | "'received-spf',\n", 97 | "'x-original-to',\n", 98 | "'content-disposition',\n", 99 | "'mailing-list',\n", 100 | "'x-spam-check-by',\n", 101 | "'domainkey-signature',\n", 102 | "'importance',\n", 103 | "'x-mailing-list',\n", 104 | "'label']\n", 105 | "\n", 106 | "list_of_rows = []\n", 107 | "\n", 108 | "def getIndexMap(index_path, data_path):\n", 109 | " index = {}\n", 110 | " with open(index_path, encoding='us-ascii') as index_file:\n", 111 | " for i, line in enumerate(index_file):\n", 112 | " type = line[0:4]\n", 113 | " file_path = line.split('/')[2][:-1]\n", 114 | " if type == 'spam':\n", 115 | " index[f'{data_path}{file_path}'] = 1\n", 116 | " else:\n", 117 | " index[f'{data_path}{file_path}'] = 0\n", 118 | " return index\n", 119 | "\n", 120 | "\n", 121 | "def addEmailToList(file_path, index):\n", 122 | " global parser, list_of_rows\n", 123 | " \n", 124 | " # The label for the email: ham or spam\n", 125 | " label = index[file_path]\n", 126 | " hops = 0\n", 127 | " emailStr = \"\"\n", 128 | " row_dict = {}\n", 129 | " \n", 130 | " # Read the full email content\n", 131 | " try:\n", 132 | " with open(file_path, encoding='latin_1') as emailFile:\n", 133 | " for line in emailFile:\n", 134 | " emailStr += line\n", 135 | " except UnicodeDecodeError:\n", 136 | " pass\n", 137 | " \n", 138 | " # Parse the email content\n", 139 | " h = parser.parsestr(emailStr)\n", 140 | " \n", 141 | " # Parse recieved field\n", 142 | " received_list = h.get_all('received')\n", 143 | " hops = 0\n", 144 | " if received_list is not None:\n", 145 | " hops = len(received_list)\n", 146 | " col_name_recieved = 'received'\n", 147 | " \n", 148 | " for inx, received_field in enumerate(received_list):\n", 149 | " col = col_name_recieved + str(inx+1)\n", 150 | " row_dict[col] = received_field\n", 151 | "\n", 152 | " \n", 153 | " # Make everything lowercase to avoid issues\n", 154 | " features_lower_case = [x.lower() for x in 
h.keys()]\n", 155 | " \n", 156 | " # Parse everything else\n", 157 | " new_row = dict(zip(features_lower_case, h.values()))\n", 158 | " new_row['hops'] = hops\n", 159 | " \n", 160 | " \n", 161 | " for key,value in new_row.items():\n", 162 | " if key in columns:\n", 163 | " row_dict['label'] = label\n", 164 | " row_dict[key] = value\n", 165 | " \n", 166 | " \n", 167 | " list_of_rows.append(row_dict)\n", 168 | "\n", 169 | "def main():\n", 170 | " global list_of_rows\n", 171 | " index = getIndexMap('trec07p/full/index', 'trec07p/data/')\n", 172 | " start_time = time.time()\n", 173 | " counter = 0\n", 174 | " # Go through each email in the data set\n", 175 | " for emailFile in listdir('trec07p/data'):\n", 176 | " \n", 177 | " # Show progress\n", 178 | " counter += 1\n", 179 | " if (counter % 1000 == 0):\n", 180 | " print(counter)\n", 181 | " print(time.time() - start_time, \"seconds\")\n", 182 | " start_time = time.time()\n", 183 | " path = 'trec07p/data/' + emailFile\n", 184 | " addEmailToList(path, index)\n", 185 | " \n", 186 | " # Create the dataframe\n", 187 | " df = pd.DataFrame(list_of_rows)\n", 188 | " df = df[columns]\n", 189 | " \n", 190 | " # Output the dataframe to a .csv file\n", 191 | " df.to_csv('extracted_spam_ham.csv', index=False)\n", 192 | "\n", 193 | "if __name__ == '__main__':\n", 194 | " main()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "**Show the most common header fields:**" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "1000\n", 214 | "2000\n", 215 | "3000\n", 216 | "4000\n", 217 | "5000\n", 218 | "6000\n", 219 | "7000\n", 220 | "8000\n", 221 | "9000\n", 222 | "10000\n", 223 | "11000\n", 224 | "12000\n", 225 | "13000\n", 226 | "14000\n", 227 | "15000\n", 228 | "16000\n", 229 | "17000\n", 230 | "18000\n", 231 | "19000\n", 232 | 
"20000\n", 233 | "21000\n", 234 | "22000\n", 235 | "23000\n", 236 | "24000\n", 237 | "25000\n", 238 | "26000\n", 239 | "27000\n", 240 | "28000\n", 241 | "29000\n", 242 | "30000\n", 243 | "31000\n", 244 | "32000\n", 245 | "33000\n", 246 | "34000\n", 247 | "35000\n", 248 | "36000\n", 249 | "37000\n", 250 | "38000\n", 251 | "39000\n", 252 | "40000\n", 253 | "41000\n", 254 | "42000\n", 255 | "43000\n", 256 | "44000\n", 257 | "45000\n", 258 | "46000\n", 259 | "47000\n", 260 | "48000\n", 261 | "49000\n", 262 | "50000\n", 263 | "51000\n", 264 | "52000\n", 265 | "53000\n", 266 | "54000\n", 267 | "55000\n", 268 | "56000\n", 269 | "57000\n", 270 | "58000\n", 271 | "59000\n", 272 | "60000\n", 273 | "61000\n", 274 | "62000\n", 275 | "63000\n", 276 | "64000\n", 277 | "65000\n", 278 | "66000\n", 279 | "67000\n", 280 | "68000\n", 281 | "69000\n", 282 | "70000\n", 283 | "71000\n", 284 | "72000\n", 285 | "73000\n", 286 | "74000\n", 287 | "75000\n", 288 | "('return-path', 75419)\n", 289 | "('received', 75419)\n", 290 | "('message-id', 75419)\n", 291 | "('from', 75419)\n", 292 | "('date', 75394)\n", 293 | "('subject', 75246)\n", 294 | "('to', 74865)\n", 295 | "('content-type', 73329)\n", 296 | "('mime-version', 68098)\n", 297 | "('x-mailer', 41804)\n", 298 | "('content-transfer-encoding', 33985)\n", 299 | "('x-mimeole', 28593)\n", 300 | "('x-priority', 27153)\n", 301 | "('lines', 26800)\n", 302 | "('status', 26269)\n", 303 | "('content-length', 26150)\n", 304 | "('x-virus-scanned', 23703)\n", 305 | "('precedence', 22158)\n", 306 | "('list-unsubscribe', 21745)\n", 307 | "('list-subscribe', 21294)\n", 308 | "('list-id', 21257)\n", 309 | "('list-post', 21244)\n", 310 | "('list-help', 21244)\n", 311 | "('reply-to', 19408)\n", 312 | "('x-msmail-priority', 18629)\n", 313 | "('x-spam-status', 17427)\n", 314 | "('sender', 16635)\n", 315 | "('errors-to', 16029)\n", 316 | "('x-miltered', 15633)\n", 317 | "('x-uuid', 15621)\n", 318 | "('x-virus-status', 15486)\n", 319 | "('x-beenthere', 
15298)\n", 320 | "('list-archive', 15251)\n", 321 | "('x-mailman-version', 14769)\n", 322 | "('x-spam-level', 12965)\n", 323 | "('x-spam-checker-version', 12552)\n", 324 | "('references', 11595)\n", 325 | "('in-reply-to', 11424)\n", 326 | "('delivered-to', 10655)\n", 327 | "('thread-index', 10442)\n", 328 | "('user-agent', 9397)\n", 329 | "('cc', 8165)\n", 330 | "('x-original-to', 6842)\n", 331 | "('content-disposition', 6143)\n", 332 | "('received-spf', 4558)\n", 333 | "('mailing-list', 4510)\n", 334 | "('x-spam-check-by', 4504)\n", 335 | "('importance', 3936)\n", 336 | "('domainkey-signature', 3866)\n", 337 | "('x-mailing-list', 3626)\n", 338 | "('x-antiabuse', 3159)\n", 339 | "('x-source', 3140)\n", 340 | "('x-source-args', 3140)\n", 341 | "('x-source-dir', 3140)\n", 342 | "('x-mime-autoconverted', 2966)\n", 343 | "('x-originalarrivaltime', 2830)\n", 344 | "('x-old-spam-status', 2491)\n", 345 | "('x-old-spam-check-by', 2432)\n", 346 | "('organization', 2352)\n", 347 | "('x-spam', 2166)\n", 348 | "('x-mailing-list-name', 2155)\n", 349 | "('thread-topic', 2107)\n", 350 | "('x-originating-ip', 2065)\n", 351 | "('x-accept-language', 2052)\n", 352 | "('content-class', 2033)\n", 353 | "('resent-message-id', 1828)\n", 354 | "('resent-from', 1828)\n", 355 | "('resent-date', 1828)\n", 356 | "('dkim-signature', 1699)\n", 357 | "('x-antivirus', 1583)\n", 358 | "('x-loop', 1540)\n", 359 | "('resent-sender', 1492)\n", 360 | "('priority', 1486)\n", 361 | "('old-return-path', 1471)\n", 362 | "('x-rc-virus', 1471)\n", 363 | "('x-rc-spam', 1471)\n", 364 | "('x-ms-tnef-correlator', 1230)\n", 365 | "('x-ms-has-attach', 1229)\n", 366 | "('x-sender', 1204)\n", 367 | "('x-originating-email', 834)\n", 368 | "('x-ymail-osg', 824)\n", 369 | "('broadcastrecipientid', 800)\n", 370 | "('broadcastjobid', 800)\n", 371 | "('mail-followup-to', 708)\n", 372 | "('x-antivirus-status', 692)\n", 373 | "('x-enigmail-version', 678)\n", 374 | "('x-complaints-to', 653)\n", 375 | "('x-destination-id', 
550)\n", 376 | "('x-mailed-to', 543)\n", 377 | "('x-to', 543)\n", 378 | "('content-base', 536)\n", 379 | "('x-asg-debug-id', 535)\n", 380 | "('x-barracuda-url', 535)\n", 381 | "('x-asg-orig-subj', 535)\n", 382 | "('x-barracuda-connect', 534)\n", 383 | "('x-barracuda-start-time', 534)\n", 384 | "('x-barracuda-virus-scanned', 534)\n", 385 | "('x-original-date', 529)\n", 386 | "('error-to', 524)\n", 387 | "('x-posted-by', 502)\n", 388 | "('x-barracuda-spam-score', 501)\n", 389 | "('x-barracuda-spam-status', 501)\n", 390 | "('x-barracuda-spam-report', 500)\n", 391 | "('x-google-sender-auth', 494)\n", 392 | "('x-virtualservergroup', 490)\n", 393 | "('x-mailingid', 490)\n", 394 | "('x-smfbl', 490)\n", 395 | "('x-smheadermap', 490)\n", 396 | "('x-barracuda-bayes', 488)\n", 397 | "('x-virtualserver', 465)\n", 398 | "('x-keywords', 465)\n", 399 | "('x-rt-loop-prevention', 417)\n", 400 | "('rt-ticket', 417)\n", 401 | "('managed-by', 417)\n", 402 | "('rt-originator', 417)\n", 403 | "('x-rt-original-encoding', 417)\n", 404 | "('x-cguid', 335)\n", 405 | "('x-uguid', 335)\n", 406 | "('message-type', 335)\n", 407 | "('x-trace', 332)\n", 408 | "('x-nabble-from', 317)\n", 409 | "('resent-to', 313)\n", 410 | "('x-injected-via-gmane', 305)\n", 411 | "('x-gmane-nntp-posting-host', 305)\n", 412 | "('nntp-posting-host', 301)\n", 413 | "('nntp-posting-date', 301)\n", 414 | "('x-mailman-approved-at', 295)\n", 415 | "('x-rt-newticket', 287)\n", 416 | "('x-http-useragent', 286)\n", 417 | "('complaints-to', 284)\n", 418 | "('injection-info', 284)\n", 419 | "('x-virus-checked', 283)\n", 420 | "('mbox-line', 281)\n", 421 | "('x-scanned-by', 270)\n", 422 | "('x-ironport-av', 255)\n", 423 | "('x-rav-antivirus', 252)\n", 424 | "('x-mimetrack', 234)\n", 425 | "('x-id', 233)\n", 426 | "('x-authentication-warning', 233)\n", 427 | "('x-spam-score', 229)\n", 428 | "('x-greylist', 211)\n", 429 | "('x-bulkmail', 208)\n", 430 | "('x-provags-id', 200)\n", 431 | "('x-content-filtered-by', 198)\n", 432 | 
"('x-x-sender', 177)\n", 433 | "('x-userid', 175)\n", 434 | "('x-recipient', 169)\n", 435 | "('x-barracuda-encrypted', 167)\n", 436 | "('x-pgp-key', 162)\n", 437 | "('original-recipient', 158)\n", 438 | "('x-firewall-bypass', 148)\n", 439 | "('x-ip', 147)\n", 440 | "('brma', 147)\n", 441 | "('brmasmtpauthuser', 147)\n", 442 | "('x-authenticated', 146)\n", 443 | "('x-spam-flag', 143)\n", 444 | "('x-operating-system', 141)\n", 445 | "('x-pmx-version', 140)\n", 446 | "('x-plaintext', 139)\n", 447 | "('x-message-id', 133)\n", 448 | "('x-redirectstatus', 132)\n", 449 | "('x-elnk-trace', 127)\n", 450 | "('openpgp', 125)\n", 451 | "('x-newsreader', 125)\n", 452 | "('x-wss-id', 123)\n", 453 | "('list-owner', 122)\n", 454 | "('x-server-uuid', 122)\n", 455 | "('x-kaspersky-antivirus', 116)\n", 456 | "('x-authenticated-user', 103)\n", 457 | "('x-library', 103)\n", 458 | "('cancel-lock', 103)\n", 459 | "('x-apparently-to', 98)\n", 460 | "('content-language', 97)\n", 461 | "('x-gmx-antivirus', 96)\n", 462 | "('x-message-info', 93)\n", 463 | "('x-quotation', 90)\n", 464 | "('x-url', 88)\n", 465 | "('x-loom-ip', 86)\n", 466 | "('bcc', 83)\n", 467 | "('x-y-gmx-trusted', 82)\n", 468 | "('x-http-via', 81)\n", 469 | "('x-proofpoint-virus-version', 80)\n", 470 | "('x-job', 79)\n", 471 | "('x-no-archive', 75)\n", 472 | "('comment', 73)\n", 473 | "('x-fb05-mailscanner', 73)\n", 474 | "('x-fb05-mailscanner-spamcheck', 73)\n", 475 | "('x-fb05-mailscanner-from', 73)\n", 476 | "('x-brightmail-tracker', 71)\n", 477 | "('x-chzlrs', 68)\n", 478 | "('x-ironport-anti-spam-filtered', 64)\n", 479 | "('x-ironport-anti-spam-result', 64)\n", 480 | "('x-sa-exim-connect-ip', 64)\n", 481 | "('comments', 63)\n", 482 | "('x-spam-report', 62)\n", 483 | "('x-originating-server', 61)\n", 484 | "('x-orcpt', 60)\n", 485 | "('x-me-uuid', 59)\n", 486 | "('x-cpanel-mailscanner-information', 59)\n", 487 | "('x-cpanel-mailscanner', 59)\n", 488 | "('x-cpanel-mailscanner-spamcheck', 59)\n", 489 | 
"('x-cpanel-mailscanner-from', 59)\n", 490 | "('x-envelope-from', 58)\n", 491 | "('x-pstn-levels', 58)\n", 492 | "('x-pstn-settings', 58)\n", 493 | "('x-pstn-addresses', 58)\n", 494 | "('x-pps', 57)\n", 495 | "('approved-by', 57)\n", 496 | "('newsgroups', 57)\n", 497 | "('x-public-key-id', 57)\n", 498 | "('x-public-key-fingerprint', 57)\n", 499 | "('x-public-key-url', 57)\n", 500 | "('x-post-from', 57)\n", 501 | "('mail-copies-to', 57)\n", 502 | "('x-st-mf-message-resent', 56)\n", 503 | "('auto-submitted', 55)\n", 504 | "('content-description', 54)\n", 505 | "('x-subscription', 53)\n", 506 | "('x-verification', 53)\n", 507 | "('x_id', 53)\n", 508 | "('organisation', 53)\n", 509 | "('domainkey-status', 53)\n", 510 | "('x-enigmail-supports', 52)\n", 511 | "('x-from_', 52)\n", 512 | "('x-sa-exim-mail-from', 50)\n", 513 | "('x-return-path', 50)\n", 514 | "('x-mdaemon-deliver-to', 50)\n", 515 | "('x-useless-header', 49)\n", 516 | "('x-orbl', 49)\n", 517 | "('xauthentication-warning', 49)\n", 518 | "('x-x-quotation', 48)\n", 519 | "('x-sa-exim-scanned', 48)\n", 520 | "('x-pplu', 47)\n", 521 | "('sensitivity', 47)\n", 522 | "('x-user_ip', 45)\n", 523 | "('x-remove', 45)\n", 524 | "('x-mdrcpt-to', 45)\n", 525 | "('x-sasl-enc', 43)\n", 526 | "('envelope-from', 42)\n", 527 | "('x_email_key', 42)\n", 528 | "('x-brightmailfiltered', 41)\n", 529 | "('x-spln-id', 40)\n", 530 | "('x-scan-signature', 40)\n", 531 | "('x-fhcrc-scanned', 39)\n", 532 | "('x-face', 37)\n", 533 | "('x-sa-exim-version', 37)\n", 534 | "('x-junkmail-status', 36)\n", 535 | "('x-junkmail-sd-raw', 36)\n", 536 | "('x-uwash-spam', 36)\n", 537 | "('x-received', 36)\n", 538 | "('x-fraunhofer-email-policy', 36)\n", 539 | "('x-list-owner-address', 36)\n", 540 | "('x-rfc2646', 36)\n", 541 | "('x-imss-result', 33)\n", 542 | "('x-rzg-auth', 33)\n", 543 | "('x-rzg-class-id', 33)\n", 544 | "('uei', 33)\n", 545 | "('xmessage-id', 33)\n", 546 | "('x-originatingip', 32)\n", 547 | "('x-spamfilter-host', 32)\n", 548 | 
"('x-no-cc', 32)\n", 549 | "('x-pgp', 31)\n", 550 | "('x-lsv-listid', 31)\n", 551 | "('x-header', 30)\n", 552 | "('x-ucl-mailscanner-information', 30)\n", 553 | "('x-ucl-mailscanner', 30)\n", 554 | "('x-ucl-mailscanner-from', 30)\n", 555 | "('x-ecs-mailscanner-bbsrc', 30)\n", 556 | "('x-scanner', 29)\n", 557 | "('x-senderip', 29)\n", 558 | "('x-imss-version', 29)\n", 559 | "('x-asg-whitelist', 29)\n", 560 | "('x-viruschecked', 29)\n", 561 | "('x-env-sender', 29)\n", 562 | "('x-starscan-version', 29)\n", 563 | "('x-sbrs-gw1', 29)\n", 564 | "('x-sensitivity', 28)\n", 565 | "('x-xam3-api-version', 28)\n", 566 | "('x-imss-tmaseresult', 28)\n", 567 | "('x-imss-settings', 28)\n", 568 | "('x-msg-ref', 28)\n", 569 | "('x-list-software', 28)\n", 570 | "('x-nai-spam-score', 28)\n", 571 | "('x-abuse-info', 28)\n", 572 | "('x-return-path', 2)\n", 1019 | "('x-fcul-mailscanner-information', 2)\n", 1020 | "('x-fcul-mailscanner', 2)\n", 1021 | "('x-fcul-mailscanner-from', 2)\n", 1022 | "('x-password', 2)\n", 1023 | "('x-source-sender', 2)\n", 1024 | "('x-kclspamscore', 2)\n", 1025 | "('x-kclrealspamscore', 2)\n", 1026 | "('x-kclzstatus', 2)\n", 1027 | "('x-kclspamreport', 2)\n", 1028 | "('x-kcl-mailscanner', 2)\n", 1029 | "('x-delivery-agent', 2)\n", 1030 | "('x-primary-address', 2)\n", 1031 | "('x-mirapoint-rapid-raw', 2)\n", 1032 | "('x-mirapoint-loop-id', 2)\n", 1033 | "('x-mobistatus', 2)\n", 1034 | "('x-mobivirusstatus', 2)\n", 1035 | "('x-mobispamstatus', 2)\n", 1036 | "('x-remoteip', 2)\n", 1037 | "('xoriginalsenderip', 2)\n", 1038 | "('x-cgnet-com-mailscanner-information', 2)\n", 1039 | "('x-cgnet-com-mailscanner', 2)\n", 1040 | "('x-cgnet-com-mailscanner-from', 2)\n", 1041 | "('x-songbirdinformation', 2)\n", 1042 | "('x-songbird', 2)\n", 1043 | "('x-songbird-from', 2)\n", 1044 | "('x-m', 2)\n", 1045 | "('x-uob-sender', 2)\n", 1046 | "('x-snort', 2)\n", 1047 | "('x-ipaddress', 2)\n", 1048 | "('x-abuse', 2)\n", 1049 | "('x-sender-info', 2)\n", 1050 | 
"('x-sgul-mailscanner-mh2', 2)\n", 1051 | "('x-infomail-id', 2)\n", 1052 | "('x-length', 2)\n", 1053 | "('x-sgul-mailscanner-mh1', 2)\n", 1054 | "('x-sig5', 2)\n", 1055 | "('x-planet', 2)\n", 1056 | "('x-fbk', 2)\n", 1057 | "('x-spamtest-envelope-from', 2)\n", 1058 | "('x-spamtest-group-id', 2)\n", 1059 | "('x-spamtest-method', 2)\n", 1060 | "('x-spamtest-rate', 2)\n", 1061 | "('x-spamtest-status-extended', 2)\n", 1062 | "('x-server', 2)\n", 1063 | "('x-origin-ip', 2)\n", 1064 | "('x-gpg', 2)\n", 1065 | "('authenticated-user', 2)\n", 1066 | "('x-univ-perp-mailscanner-information', 2)\n", 1067 | "('x-univ-perp-mailscanner', 2)\n", 1068 | "('x-univ-perp-mailscanner-from', 2)\n", 1069 | "('x-blargav-status', 2)\n", 1070 | "('x-prussianet-mailscanner-information', 2)\n", 1071 | "('x-prussianet-mailscanner', 2)\n", 1072 | "('x-prussianet-mailscanner-spamcheck', 2)\n", 1073 | "('x-prussianet-mailscanner-from', 2)\n", 1074 | "('x-accepted-file-formats', 2)\n", 1075 | "('x-ui-msg-verification', 2)\n", 1076 | "('x-return', 2)\n", 1077 | "('x-j-chkmail-score', 2)\n", 1078 | "('x-eric-conspiracy', 2)\n", 1079 | "('x-hellofbi-1', 2)\n", 1080 | "('x-hellofbi-2', 2)\n", 1081 | "('x-openpgp-key', 2)\n", 1082 | "('x-openpgp-key-fingerprint', 2)\n", 1083 | "('x-ctuhulu', 2)\n", 1084 | "('seal-send-time', 2)\n", 1085 | "('x-location', 2)\n", 1086 | "('x-tud-virus-scanned', 2)\n", 1087 | "('x-csir-mailscanner-information', 2)\n", 1088 | "('x-csir-mailscanner', 2)\n", 1089 | "('x-bcg-mailscanner-information', 2)\n", 1090 | "('x-bcg-mailscanner', 2)\n", 1091 | "('x-bcg-mailscanner-from', 2)\n", 1092 | "('x-pmx-spam', 2)\n", 1093 | "('x-pmx-spam-level', 2)\n", 1094 | "('x-s110firewall-mailscanner-information', 2)\n", 1095 | "('x-s110firewall-mailscanner', 2)\n", 1096 | "('x-s110firewall-mailscanner-spamcheck', 2)\n", 1097 | "('x-s110firewall-mailscanner-spamscore', 2)\n", 1098 | "('x-ids', 2)\n", 1099 | "('x-orig-x-trace', 2)\n", 1100 | "('x-gentoo-version', 2)\n", 1101 | 
"('x-cis-mailscanner', 2)\n", 1102 | "('x-cis-mailscanner-spamcheck', 2)\n", 1103 | "('x-cis-mailscanner-from', 2)\n", 1104 | "('x-authenticated-sender-id', 2)\n", 1105 | "('x-fun', 2)\n", 1106 | "('x-originate-ip', 2)\n", 1107 | "('x-emid', 2)\n", 1108 | "('x-mintra-mailscanner-information', 2)\n", 1109 | "('x-mintra-mailscanner', 2)\n", 1110 | "('face', 2)\n", 1111 | "('x-spamscan-3', 2)\n", 1112 | "('smtp-auth', 2)\n", 1113 | "('x-originid', 2)\n", 1114 | "('x-kontent-script', 2)\n", 1115 | "('x-kontent-sender', 2)\n", 1116 | "('x-mundosaparte-mailscanner-information', 1)\n", 1117 | "('x-mundosaparte-mailscanner', 1)\n", 1118 | "('x-mundosaparte-mailscanner-from', 1)\n", 1119 | "('x-virus-infected', 1)\n", 1120 | "('x-ironportlistener', 1)\n", 1121 | "('x-smartmax-authuser', 1)\n", 1122 | "('x-usanet-auth', 1)\n", 1123 | "('z-usanet-msgid', 1)\n", 1124 | "('x-client-addr', 1)\n", 1125 | "('x-client-sender', 1)\n", 1126 | "('x-masf', 1)\n", 1127 | "('x-handler', 1)\n", 1128 | "('x-sagator-scanner', 1)\n", 1129 | "('x-sagator-id', 1)\n", 1130 | "('x-uon-mailscanner-information', 1)\n", 1131 | "('x-uon-mailscanner', 1)\n", 1132 | "('x-uon-mailscanner-from', 1)\n", 1133 | "('x-www4.wculife.com-msgid', 1)\n", 1134 | "('x-www7.portrait4me.com-msgid', 1)\n", 1135 | "('x-www2.ttands.com-msgid', 1)\n", 1136 | "('x-ksg-checked', 1)\n", 1137 | "('goowy', 1)\n", 1138 | "('x-campaignid', 1)\n", 1139 | "('x-gmail-account', 1)\n", 1140 | "('x-mediabeam-mailscanner-information', 1)\n", 1141 | "('x-mediabeam-virusprotection', 1)\n", 1142 | "('x-reportbug-version', 1)\n", 1143 | "('x-dad-mailscanner-information', 1)\n", 1144 | "('x-dad-mailscanner', 1)\n", 1145 | "('x-dad-mailscanner-from', 1)\n", 1146 | "('x-vwebmail-auth', 1)\n", 1147 | "('x-no-spam-score', 1)\n", 1148 | "('trusted-delivery-validation-state', 1)\n", 1149 | "('x-envelope-sender', 1)\n", 1150 | "('x-remote-ip', 1)\n", 1151 | "('x-eyou-spamvalue', 1)\n", 1152 | "('x-eyou-dealdrc', 1)\n", 1153 | 
"('x-imim-mailscanner-information', 1)\n", 1154 | "('x-imim-mailscanner', 1)\n", 1155 | "('x-imim-mailscanner-from', 1)\n", 1156 | "('x-antivirus-arale-mail-from', 1)\n", 1157 | "('x-antivirus-arale', 1)\n", 1158 | "('x-uea-spam-score', 1)\n", 1159 | "('x-uea-spam-level', 1)\n", 1160 | "('x-uea-spam-flag', 1)\n", 1161 | "('mymasteradd', 1)\n", 1162 | "('mysourceid', 1)\n", 1163 | "('mybouncemid', 1)\n", 1164 | "('mybounceaddr', 1)\n", 1165 | "('x-nceas-mailscanner-information', 1)\n", 1166 | "('x-nceas-mercury-mailscanner', 1)\n", 1167 | "('x-nceas-mailscanner-mcpcheck', 1)\n", 1168 | "('x-nceas-mercury-mailscanner-spamcheck', 1)\n", 1169 | "('x-physci2-scanned', 1)\n", 1170 | "('x-physci2-outgoing', 1)\n", 1171 | "('x-gnupg-fingerprint', 1)\n", 1172 | "('x-fb-ss', 1)\n", 1173 | "('x-msgdaycount', 1)\n", 1174 | "('x-borndate', 1)\n", 1175 | "('automatic-legal-notices', 1)\n", 1176 | "('x-yoursite-mailscanner-information', 1)\n", 1177 | "('x-yoursite-mailscanner', 1)\n", 1178 | "('x-x-spamdetect', 1)\n", 1179 | "('x-notascii', 1)\n", 1180 | "('x-elistas-recipient', 1)\n", 1181 | "('x-offlineimap-2090390643-73747269706579-726563656e745f636f72726573706f6e64656e6365', 1)\n", 1182 | "('x-iglou-customer', 1)\n", 1183 | "('x-centeq-mailscanner-information', 1)\n", 1184 | "('x-centeq-mailscanner', 1)\n", 1185 | "('x-centeq-mailscanner-from', 1)\n", 1186 | "('x-exchangesecure-antispam', 1)\n", 1187 | "('x-from-line', 1)\n", 1188 | "('x-server-name', 1)\n", 1189 | "('x-script-name', 1)\n", 1190 | "('x-remote-addr', 1)\n", 1191 | "('x-abuse-to', 1)\n", 1192 | "('x-countrycode', 1)\n", 1193 | "('x-matched-lists', 1)\n", 1194 | "('x-scalix-hops', 1)\n", 1195 | "('x-sent-to', 1)\n", 1196 | "('x-failed-recipients', 1)\n", 1197 | "('x-delete-me', 1)\n", 1198 | "('x-campid', 1)\n", 1199 | "('x-gpg-signature', 1)\n", 1200 | "('x-cryst-bbk-mailscanner-information', 1)\n", 1201 | "('x-cryst-bbk-mailscanner', 1)\n", 1202 | "('x-cryst-bbk-mailscanner-spamcheck', 1)\n", 1203 | 
"('x-cryst-bbk-mailscanner-from', 1)\n", 1204 | "('x-mxguard-info', 1)\n", 1205 | "('x-mxguard-spoolid', 1)\n", 1206 | "('x-mxguard-sender', 1)\n", 1207 | "('x-mxguard-virus-info', 1)\n", 1208 | "('x-mxguard-trusted', 1)\n", 1209 | "('x-mxguard-spam-score', 1)\n", 1210 | "('x-mxguard-spam-probability', 1)\n", 1211 | "('x-nopam-status', 1)\n", 1212 | "('x-nopam-diag', 1)\n", 1213 | "('x-en-userinfo', 1)\n", 1214 | "('x-en-authuser', 1)\n", 1215 | "('x-en-orighost', 1)\n", 1216 | "('x-copfilter', 1)\n", 1217 | "('x-filtered-with-copfilter', 1)\n", 1218 | "('x-copfilter-virus-scanned', 1)\n", 1219 | "('x-copfilter-originating-ip', 1)\n", 1220 | "('x-internetksc-mailscanner-information', 1)\n", 1221 | "('x-internetksc-mailscanner', 1)\n", 1222 | "('x-internetksc-mailscanner-spamcheck', 1)\n", 1223 | "('x-internetksc-mailscanner-spamscore', 1)\n", 1224 | "('x-internetksc-mailscanner-from', 1)\n", 1225 | "('x-umn-remote-mta', 1)\n", 1226 | "('x-serial', 1)\n", 1227 | "('x-inquiry-date', 1)\n", 1228 | "('x-inquiry-ip', 1)\n", 1229 | "('x-responder-id', 1)\n", 1230 | "('x-spam-tests', 1)\n", 1231 | "('x-spam-see', 1)\n", 1232 | "('x-spam-scored', 1)\n", 1233 | "('x-spam-summary', 1)\n", 1234 | "('x-spam-autolearn', 1)\n", 1235 | "('x-spam-scan-date', 1)\n", 1236 | "('x-spam-dccd-results', 1)\n", 1237 | "('x-spam-rbl-results', 1)\n", 1238 | "('x-gpg-key-fingerprint', 1)\n", 1239 | "('x-initiated-by', 1)\n", 1240 | "('x-spam-prev-subject', 1)\n", 1241 | "('delivery-date', 1)\n", 1242 | "('x-pietru-qal', 1)\n", 1243 | "('x-pietru-spamcheck', 1)\n", 1244 | "('x-pietru-from', 1)\n", 1245 | "('x-cloudmark-score', 1)\n", 1246 | "('x-spot', 1)\n", 1247 | "('x-lycos-as', 1)\n", 1248 | "('x-lycos-av', 1)\n", 1249 | "('x-lycos-is', 1)\n", 1250 | "('x-nai-spam-level', 1)\n", 1251 | "('x-sendera', 1)\n", 1252 | "('x-neusoft-mailscanner-information', 1)\n", 1253 | "('x-neusoft-mailscanner', 1)\n", 1254 | "('x-tiptop-mailscanner-information', 1)\n", 1255 | "('x-tiptop-mailscanner', 
1)\n", 1256 | "('x-tiptop-mailscanner-from', 1)\n", 1257 | "('accept', 1)\n", 1258 | "('x-ecartis-antiloop', 1)\n", 1259 | "('expiry-date', 1)\n", 1260 | "('x-env-recipient', 1)\n", 1261 | "('x-end-of-envelope', 1)\n", 1262 | "('x-qhpsi', 1)\n", 1263 | "('x-disclaimer', 1)\n", 1264 | "('x-mailscanner', 1)\n", 1265 | "('x-ucalgary-mailscanner-information', 1)\n", 1266 | "('x-ucalgary-mailscanner', 1)\n", 1267 | "('x-ucalgary-mailscanner-from', 1)\n", 1268 | "('auth-sender', 1)\n", 1269 | "('x-uri-orig', 1)\n", 1270 | "('x-uniqueid', 1)\n", 1271 | "('x-debian-version', 1)\n", 1272 | "('x-mailscanner-mluri-information', 1)\n", 1273 | "('x-mailscanner-mluri', 1)\n", 1274 | "('x-mailscanner-mluri-spamcheck', 1)\n", 1275 | "('x-mailscanner-mluri-from', 1)\n", 1276 | "('x-asmtp', 1)\n", 1277 | "('final-recipient', 1)\n", 1278 | "('x-loop-detect', 1)\n", 1279 | "('x-autogenerated', 1)\n", 1280 | "('x-originating-auth', 1)\n", 1281 | "('x-virus-scan-time', 1)\n", 1282 | "('x-http-host', 1)\n", 1283 | "('x-webmail-userid', 1)\n", 1284 | "('x-exp32-serialno', 1)\n", 1285 | "('x-confirmation-reading-to', 1)\n", 1286 | "('x-rcpt-level', 1)\n", 1287 | "('x-space1', 1)\n", 1288 | "('x-\"number', 1)\n", 1289 | "('x-<1157577119.46536', 1)\n", 1290 | "('x-{', 1)\n", 1291 | "('x-a', 1)\n", 1292 | "('x->', 1)\n", 1293 | "('x-|', 1)\n", 1294 | "('x-johan-meskens-cs3s-computer~', 1)\n", 1295 | "('x-webmail-company', 1)\n", 1296 | "('x-ucd-spam-score', 1)\n", 1297 | "('x-virii-scanned', 1)\n", 1298 | "('x-dinascanner-information', 1)\n", 1299 | "('x-dinascanner', 1)\n", 1300 | "('x-dinascanner-spamcheck', 1)\n", 1301 | "('x-dinascanner-from', 1)\n", 1302 | "('x-policy', 1)\n", 1303 | "('x-group', 1)\n", 1304 | "('x-lasthop', 1)\n", 1305 | "('x-af-id', 1)\n", 1306 | "('x-af-pwd', 1)\n", 1307 | "('x-af-rec', 1)\n", 1308 | "('x-cthulhu', 1)\n", 1309 | "('x-pgp-fingerprint', 1)\n", 1310 | "('x-coven', 1)\n", 1311 | "('x-cat', 1)\n", 1312 | "('x-mice', 1)\n", 1313 | "('x-rip-1', 1)\n", 1314 | 
"('x-rip-2', 1)\n", 1315 | "('x-rip-3', 1)\n", 1316 | "('x-rip-4', 1)\n", 1317 | "('x-rip-5', 1)\n", 1318 | "('x-geekcode-1', 1)\n", 1319 | "('x-geekcode-2', 1)\n", 1320 | "('x-spamcatcher-score', 1)\n", 1321 | "('x-user-agent', 1)\n", 1322 | "('x-wp-ip', 1)\n", 1323 | "('x-wp-av', 1)\n", 1324 | "('x-wp-spam', 1)\n", 1325 | "('x-testing-watch-package', 1)\n", 1326 | "('x-testing-watch-version', 1)\n", 1327 | "('x-ctinetworks-information', 1)\n", 1328 | "('x-ctinetworks-viruscheck', 1)\n", 1329 | "('x-ctinetworks-spamcheck', 1)\n", 1330 | "('x-mlf-version', 1)\n", 1331 | "('x-mlf-uniqueid', 1)\n", 1332 | "('x-gpg-key-server', 1)\n", 1333 | "('x-mapi-message-class', 1)\n", 1334 | "('x-ms-embedded-report', 1)\n", 1335 | "('x-onepass', 1)\n", 1336 | "('x-sef-e2d26921-3e31-4cce-94a-7ce64201448', 1)\n", 1337 | "('x-spamgourmet', 1)\n", 1338 | "('x-s0phie-scan', 1)\n", 1339 | "('delivery-class', 1)\n", 1340 | "('x-mailerisp', 1)\n", 1341 | "('x-company_id', 1)\n", 1342 | "('x-cts-enabled', 1)\n", 1343 | "('x-campaign', 1)\n", 1344 | "('x-cp-rms-expiry', 1)\n", 1345 | "('x-cp-rms-type', 1)\n", 1346 | "('x-cp-rms-size', 1)\n", 1347 | "('x-cp-rms-site', 1)\n", 1348 | "('x-cp-rms-display', 1)\n", 1349 | "('x-cp-ps-service', 1)\n", 1350 | "('x-classifier', 1)\n", 1351 | "('x-protective-marking', 1)\n", 1352 | "('x-wsmtpid', 1)\n", 1353 | "('x-wsmtpts', 1)\n", 1354 | "('x-wsmtpmk', 1)\n", 1355 | "('x-wsmtpck', 1)\n", 1356 | "('x-modus-audit', 1)\n", 1357 | "('x-receiver', 1)\n", 1358 | "('operating-system', 1)\n", 1359 | "('mua', 1)\n", 1360 | "('x-homemaster', 1)\n", 1361 | "('x-homemaster-trace', 1)\n", 1362 | "('x-homeemaster-team-photo', 1)\n", 1363 | "('x-sender-host', 1)\n", 1364 | "('x-script-filename', 1)\n", 1365 | "('x-sohu-uid', 1)\n", 1366 | "('x-sohu-antivirus', 1)\n", 1367 | "('x-coremail-antispam', 1)\n", 1368 | "('x-antivirus-mydomain-message-id', 1)\n", 1369 | "('x-eyoumail-smtpauth', 1)\n", 1370 | "('x-isumailhub-test', 1)\n", 1371 | "('x-hsu-virusscan', 
1)\n", 1372 | "('x-imapbase', 1)\n", 1373 | "('x-sf-loop', 1)\n", 1374 | "('x-original-authlogin', 1)\n", 1375 | "('x-av-checked[2ab7]', 1)\n", 1376 | "('x-av-updated[2ab7]', 1)\n", 1377 | "('x-av-version[2ab7]', 1)\n", 1378 | "('x-smtp-vilter-version', 1)\n", 1379 | "('x-mimedefang-filter', 1)\n", 1380 | "('x-originator', 1)\n", 1381 | "('x-mailscanner-information-wiesinger.com', 1)\n", 1382 | "('x-mailscanner-wiesinger.com', 1)\n", 1383 | "('x-mailscanner-spamcheck-wiesinger.com', 1)\n", 1384 | "('x-loopdetect', 1)\n", 1385 | "('x-senderbasespam', 1)\n", 1386 | "('x-uqam-spam-filter', 1)\n", 1387 | "('x-tuid', 1)\n", 1388 | "('x-envelope-recipient', 1)\n", 1389 | "('x-virus-scanned-by', 1)\n", 1390 | "('x-screened', 1)\n", 1391 | "('x-vontu', 1)\n", 1392 | "('x-rocketymmf', 1)\n", 1393 | "('x-echelon-enable', 1)\n", 1394 | "('x-tkuev', 1)\n", 1395 | "('x-brown-proofpoint', 1)\n", 1396 | "('x-brown-mailscanner-spamcheck', 1)\n", 1397 | "('x-tm-as-user-approved-sender', 1)\n", 1398 | "('x-tm-as-user-blocked-sender', 1)\n", 1399 | "('x-kwf-filterprogress', 1)\n", 1400 | "('x-ufl-spam-status', 1)\n", 1401 | "('x-ufl-scanned-by', 1)\n", 1402 | "('x-warning-html-mail2', 1)\n", 1403 | "('x-zelazny-quote', 1)\n", 1404 | "('x-pgp-fp', 1)\n", 1405 | "('x-version', 1)\n", 1406 | "('x-senderid', 1)\n", 1407 | "('x-mc-editor', 1)\n", 1408 | "('x-mc-debug', 1)\n", 1409 | "('x-companyurl', 1)\n", 1410 | "('x-antivirus-code', 1)\n", 1411 | "('ironport-content-filter', 1)\n", 1412 | "('x-peacefulcomputing-mailscanner-information', 1)\n", 1413 | "('x-peacefulcomputing-mailscanner', 1)\n", 1414 | "('x-blackcat-spam-score', 1)\n", 1415 | "('x-archived', 1)\n", 1416 | "('x-activenetwerx-mailscanner-esva-spamscore', 1)\n", 1417 | "('x-antivirus-nicochan-mail-from', 1)\n", 1418 | "('x-antivirus-nicochan', 1)\n", 1419 | "('x-iua-mailscanner', 1)\n", 1420 | "('x-iua-mailscanner-spamcheck', 1)\n", 1421 | "('x-iua-mailscanner-from', 1)\n", 1422 | "('x-cce-unipr-mailscanner-information', 
1)\n", 1423 | "('x-cce-unipr-ne-mailscanner', 1)\n", 1424 | "('x-cce-unipr-ne-mailscanner-spamcheck', 1)\n", 1425 | "('x-cce-unipr-mailscanner-spamscore', 1)\n", 1426 | "('x-cce-unipr-mailscanner-from', 1)\n", 1427 | "('x-magma-mailscanner-information', 1)\n", 1428 | "('x-magma-mailscanner', 1)\n", 1429 | "('x-smtpd', 1)\n", 1430 | "('x-nod32result', 1)\n", 1431 | "('x-spamdetails', 1)\n", 1432 | "('x-spamscore', 1)\n", 1433 | "('x-ucinetid', 1)\n", 1434 | "('x-winsocket-mailscanner-information', 1)\n", 1435 | "('x-winsocket-mailscanner', 1)\n", 1436 | "('x-guinevere', 1)\n", 1437 | "('x-hr-status', 1)\n", 1438 | "TOTAL EMAIL COUNT: 75419\n" 1439 | ] 1440 | }, 1441 | { 1442 | "data": { 1443 | "image/png": "\n", 1444 | "text/plain": [ 1445 | "
" 1446 | ] 1447 | }, 1448 | "metadata": { 1449 | "needs_background": "light" 1450 | }, 1451 | "output_type": "display_data" 1452 | } 1453 | ], 1454 | "source": [ 1455 | "dict_header_features = {}\n", 1456 | "total_emails = 0\n", 1457 | "\n", 1458 | "def getIndexMap(index_path, data_path):\n", 1459 | " index = {}\n", 1460 | " with open(index_path, encoding='us-ascii') as index_file:\n", 1461 | " for i, line in enumerate(index_file):\n", 1462 | " type = line[0:4]\n", 1463 | " file_path = line.split('/')[2][:-1]\n", 1464 | " if type == 'spam':\n", 1465 | " index[f'{data_path}{file_path}'] = 1\n", 1466 | " else:\n", 1467 | " index[f'{data_path}{file_path}'] = 0\n", 1468 | " return index\n", 1469 | "\n", 1470 | "\n", 1471 | "def addEmailToDf(file_path, index):\n", 1472 | " global dict_header_features, total_emails\n", 1473 | " #Dictionary, keys = columns values, values = None initially.\n", 1474 | " header = dict.fromkeys(columns)\n", 1475 | " \n", 1476 | " label = index[file_path]\n", 1477 | " \n", 1478 | " total_emails += 1\n", 1479 | " hops = 0\n", 1480 | " host = None\n", 1481 | " parser = HeaderParser()\n", 1482 | " emailStr = \"\"\n", 1483 | " \n", 1484 | " try:\n", 1485 | " with open(file_path, encoding='latin1') as emailFile:\n", 1486 | " for line in emailFile:\n", 1487 | " emailStr += line\n", 1488 | " except UnicodeDecodeError:\n", 1489 | " pass\n", 1490 | " \n", 1491 | " h = parser.parsestr(emailStr)\n", 1492 | " features_lower_case = [x.lower() for x in h.keys()]\n", 1493 | " \n", 1494 | " features_dict = Counter(features_lower_case)\n", 1495 | " \n", 1496 | " for k,v in features_dict.items():\n", 1497 | " if k in dict_header_features:\n", 1498 | " dict_header_features[k] += 1\n", 1499 | " else:\n", 1500 | " dict_header_features[k] = 1\n", 1501 | " \n", 1502 | "\n", 1503 | " \n", 1504 | "def main():\n", 1505 | " global dict_header_features, total_emails\n", 1506 | " \n", 1507 | " index = getIndexMap('trec07p/full/index', 'trec07p/data/')\n", 1508 | "\n", 
1509 | " counter = 0\n", 1510 | " for emailFile in listdir('trec07p/data'):\n", 1511 | " counter += 1\n", 1512 | " if counter % 1000 == 0:\n", 1513 | " print(counter)\n", 1514 | " path = 'trec07p/data/' + emailFile\n", 1515 | " addEmailToDf(path, index)\n", 1516 | " \n", 1517 | " dict_header_features = sorted(dict_header_features.items(), key=lambda x:x[1], reverse=True)\n", 1518 | "\n", 1519 | " for v in dict_header_features:\n", 1520 | " print(v)\n", 1521 | " \n", 1522 | " unzipped = list(zip(*dict_header_features))\n", 1523 | " lab = unzipped[0]\n", 1524 | " val = unzipped[1]\n", 1525 | " print(\"TOTAL EMAIL COUNT:\", total_emails)\n", 1526 | " val = [x/total_emails for x in val]\n", 1527 | " figure(figsize=(15, 6))\n", 1528 | " axes = plt.gca()\n", 1529 | " axes.set_ylim([0,1])\n", 1530 | " plt.xticks(rotation='vertical')\n", 1531 | " plt.ylabel('Percent of total emails')\n", 1532 | " plt.bar(lab[0:20], val[0:20], width=0.25)\n", 1533 | " plt.grid()\n", 1534 | " #plt.savefig('most_common_header_features_latin1.png', bbox_inches=\"tight\")\n", 1535 | " plt.show()\n", 1536 | "\n", 1537 | "if __name__ == '__main__':\n", 1538 | " main()" 1539 | ] 1540 | } 1541 | ], 1542 | "metadata": { 1543 | "kernelspec": { 1544 | "display_name": "Python 3", 1545 | "language": "python", 1546 | "name": "python3" 1547 | }, 1548 | "language_info": { 1549 | "codemirror_mode": { 1550 | "name": "ipython", 1551 | "version": 3 1552 | }, 1553 | "file_extension": ".py", 1554 | "mimetype": "text/x-python", 1555 | "name": "python", 1556 | "nbconvert_exporter": "python", 1557 | "pygments_lexer": "ipython3", 1558 | "version": "3.8.5" 1559 | } 1560 | }, 1561 | "nbformat": 4, 1562 | "nbformat_minor": 4 1563 | } 1564 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Email Anomaly Detection 2 | 3 | This is a project to detect anomalies (i.e., spam, phishing) in email 
datasets using machine learning and features extracted from email headers. A total of 94 features are extracted. Both supervised and unsupervised learning are used, with supervised learning achieving 97-99%+ accuracy and unsupervised learning achieving 87-90% accuracy. 4 | 5 | Two datasets are used, which were obtained from: 6 | (1) https://plg.uwaterloo.ca/~gvcormac/treccorpus07/about.html 7 | (2) https://monkey.org/~jose/phishing/ 8 | 9 | The first dataset (1) contains both spam and ham emails, while the second dataset (2) contains phishing emails. The first dataset (1) was parsed using the 'Capstone project - Extract HamSpam.ipynb' notebook. The second dataset (2) was parsed using the 'Capstone Project - Extract Phishing.ipynb' notebook. The resulting CSV files are then processed by the 'Capstone Project - Preproccess + Feature Extraction.ipynb' notebook, which outputs the 'preprocessed_spam_ham_phishing.csv' file. The 'preprocessed_spam_ham_phishing.csv' file contains the information used during the training and testing of the learning algorithms. 10 | 11 | The class labels in the preprocessed_spam_ham_phishing.csv are: 12 | 0: ham emails 13 | 1: spam emails 14 | 2: phishing emails 15 | 16 | Four rounds of testing were performed: 17 | 1) Supervised learning, ham and spam email classification 18 | 2) Supervised learning, ham and phishing email classification 19 | 3) Unsupervised learning, ham and spam email classification 20 | 4) Unsupervised learning, ham and phishing email classification --------------------------------------------------------------------------------