├── .gitignore ├── amico_scripts ├── logs │ └── README ├── .gitignore ├── manual_downloads │ └── README ├── weka.jar ├── models │ └── default.model ├── parsed │ ├── captured_files │ │ └── README │ └── raw_files │ │ └── README ├── stop_amico.sh ├── train_config.py ├── logging.conf ├── db_cleanup.py ├── postfile.py ├── etld.py ├── pe_extract.py ├── features.py ├── db_syslog.py ├── config.py.tmpl ├── update_urls_fix.py ├── vt_api.py ├── classify_dump.py ├── util.py ├── fe_db_setup.py ├── db_pe_dumps.py ├── manual_download.py ├── db_file_dumps.py ├── ip2asn.py ├── db_setup.py ├── db_virus_total.py ├── extract_file.py ├── start_amico.py ├── trainer.py ├── vt_submit.py └── get_feature_vector.py ├── file_dump ├── .gitignore ├── util │ ├── README │ ├── send_SIGTERM.sh │ ├── send_SIGUSR2.sh │ ├── send_SIGUSR1.sh │ ├── start.sh │ ├── set_cpu_affinity.sh │ ├── turn_offload_off.sh │ └── set_nic_irq_smp_affinity.sh ├── mac_strnlen.c ├── Makefile.valgrind ├── Makefile ├── start_file_dump.py ├── config.py.tmpl ├── README ├── seq_list.h ├── lru-cache.h ├── search.c ├── seq_list.c └── lru-cache.c ├── README.md └── license.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.log 3 | -------------------------------------------------------------------------------- /amico_scripts/logs/README: -------------------------------------------------------------------------------- 1 | Amico error and debug logs directory 2 | -------------------------------------------------------------------------------- /amico_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | parsed/ 3 | manual_downloads/ 4 | -------------------------------------------------------------------------------- /amico_scripts/manual_downloads/README: -------------------------------------------------------------------------------- 1 | Stores "manually" re-downloaded files 2 | -------------------------------------------------------------------------------- /file_dump/.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | file_dump 3 | *.o 4 | *.log 5 | *.log.* 6 | dumps/ 7 | -------------------------------------------------------------------------------- /amico_scripts/weka.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perdisci/amico/HEAD/amico_scripts/weka.jar -------------------------------------------------------------------------------- /amico_scripts/models/default.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perdisci/amico/HEAD/amico_scripts/models/default.model -------------------------------------------------------------------------------- /amico_scripts/parsed/captured_files/README: -------------------------------------------------------------------------------- 1 | This directory contains all captured files (extracted from TCP flows) 2 | -------------------------------------------------------------------------------- /file_dump/util/README: -------------------------------------------------------------------------------- 1 | These scripts will need to be slightly edited to adapt them to your own system setup and needs 2 | -------------------------------------------------------------------------------- /amico_scripts/parsed/raw_files/README: -------------------------------------------------------------------------------- 
1 | This directory contains the raw TCP flow reconstruction containing the desired captured file 2 | -------------------------------------------------------------------------------- /amico_scripts/stop_amico.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps ux | grep start_amico.py | grep -v grep | awk '{print $2}'); do 4 | kill $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGTERM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps aux | grep file_dump | grep -v python | grep -v sudo | grep -v postgres | grep -v grep | awk '{print $2}'); do 4 | sudo kill -SIGTERM $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGUSR2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps aux | grep file_dump | grep -v python | grep -v sudo | grep -v postgres | grep -v grep | awk '{print $2}'); do 4 | sudo kill -SIGUSR2 $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGUSR1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(pgrep file_dump); do 4 | sudo kill -SIGUSR1 $i; 5 | done 6 | 7 | for i in $(ls zc98_*.log); do 8 | tail $i | egrep "(dropped|received)"; 9 | done 10 | -------------------------------------------------------------------------------- /file_dump/util/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # launch as ./util/start.sh 4 | 5 | for i in {0..8}; do 6 | python start_file_dump.py "zc:99@$i" >& zc99_$i.log & 7 | done 8 | 9 | sleep 1 10 | 11 | ./util/set_cpu_affinity.sh 2 12 | -------------------------------------------------------------------------------- /file_dump/mac_strnlen.c: -------------------------------------------------------------------------------- 1 | #ifdef __APPLE__ 2 | 3 | #include <string.h> 4 | 5 | size_t strnlen(const char *s, size_t n) { 6 | int i; 7 | 8 | for(i=0; i<n && s[i] != '\0'; i++); 9 | return i; 10 | } 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /file_dump/util/set_nic_irq_smp_affinity.sh: -------------------------------------------------------------------------------- 8 | sudo echo 00000100 > /proc/irq/119/smp_affinity 9 | sudo echo 00000200 > /proc/irq/121/smp_affinity 10 | sudo echo 00000400 > /proc/irq/123/smp_affinity 11 | sudo echo 00000800 > /proc/irq/125/smp_affinity 12 | sudo echo 00001000 > /proc/irq/128/smp_affinity 13 | sudo echo 00002000 > /proc/irq/130/smp_affinity 14 | -------------------------------------------------------------------------------- /amico_scripts/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, amico_logger 3 | 4 | [handlers] 5 | keys=fileDebugHandler,fileErrorHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=ERROR 12 | handlers= 13 | 14 | [logger_amico_logger] 15 | level=DEBUG 16 | handlers=fileDebugHandler,fileErrorHandler 17 | qualname=amico_logger 18 | propagate=0 19 | 20 | [handler_fileErrorHandler] 21 | class=logging.handlers.RotatingFileHandler 22 | level=WARNING 23 | formatter=simpleFormatter 24 | args=("logs/amico_error.log", 'a', (5*1024*1024), 5) 25 | 26 | 27 | [handler_fileDebugHandler] 28 | class=logging.handlers.RotatingFileHandler 29 | level=DEBUG 30 | formatter=simpleFormatter 31 | args=("logs/amico_debug.log", 'a', (5*1024*1024), 5) 32 | 33 | [formatter_simpleFormatter] 34 | format=%(asctime)s - %(levelname)s - %(message)s 35 | --------------------------------------------------------------------------------
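Scripts can attach to this logging configuration via the standard library's fileConfig loader. A minimal usage sketch (assuming the script runs from the amico_scripts directory so the relative logs/ paths above resolve; the logger name matches qualname=amico_logger in the config):

import logging
import logging.config

logging.config.fileConfig("logging.conf")    # loads the loggers/handlers defined above
logger = logging.getLogger("amico_logger")   # qualname from the config
logger.debug("written to logs/amico_debug.log")
logger.warning("written to both logs/amico_debug.log and logs/amico_error.log")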
/file_dump/start_file_dump.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from config import whitelist_subnets, manual_download_ip 4 | 5 | 6 | def print_usage(): 7 | print "Usage: sudo python start_file_dump.py ethX" 8 | sys.exit() 9 | 10 | if len(sys.argv) < 2: 11 | print_usage() 12 | nic = sys.argv[1] 13 | 14 | bpf_filter = "\"tcp" 15 | 16 | if len(whitelist_subnets) > 0: 17 | bpf_filter += " and not (" 18 | for subnet in whitelist_subnets: 19 | bpf_filter += "net %s or " % (subnet,) 20 | bpf_filter = bpf_filter[:-4] 21 | bpf_filter += ")" 22 | 23 | if len(manual_download_ip) > 0: 24 | bpf_filter += " and not net %s" % (manual_download_ip,) 25 | 26 | bpf_filter += "\"" 27 | 28 | subprocess.call(""" 29 | ./file_dump -i %s -d dumps/ -A -J -G -f %s """ % 30 | (nic, bpf_filter), shell=True) 31 | -------------------------------------------------------------------------------- /file_dump/config.py.tmpl: -------------------------------------------------------------------------------- 1 | # The IP from which the manual downloads are happening should be listed here to 2 | # prevent infinite download loops. 3 | manual_download_ip = "" 4 | 5 | # The subnets hosting popular benign websites like Facebook, Google, MSN are 6 | # listed here and are fed to a BPF filter. This reduces the traffic load on 7 | # file_dump. 8 | whitelist_subnets = [ 9 | "69.171.224.0/20", 10 | "66.220.152.0/21", 11 | "74.125.0.0/16", 12 | "220.181.111.0/24", 13 | "123.125.114.0/24", 14 | "199.59.148.0/22", 15 | "65.54.94.0/23", 16 | "65.55.160.0/19", 17 | "65.55.192.0/18", 18 | "66.135.192.0/19", 19 | "157.166.224.0/20", 20 | "15.192.0.0/16", 21 | "143.166.0.0/17", 22 | "17.148.0.0/14", 23 | "192.150.16.0/23"] 24 | --------------------------------------------------------------------------------
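The string built in start_file_dump.py becomes the -f argument passed to file_dump; the [:-4] slice trims the final " or ". A standalone sketch of the same construction and its resulting filter (the two subnets and the manual-download IP below are illustrative values only, not recommendations):

whitelist_subnets = ["69.171.224.0/20", "66.220.152.0/21"]
manual_download_ip = "10.0.0.5"   # hypothetical; set the real one in config.py

bpf_filter = "tcp"
if len(whitelist_subnets) > 0:
    # " or ".join() is equivalent to appending "net X or " and trimming 4 chars
    bpf_filter += " and not (" + " or ".join("net %s" % s for s in whitelist_subnets) + ")"
if len(manual_download_ip) > 0:
    bpf_filter += " and not net %s" % (manual_download_ip,)
print bpf_filter
# tcp and not (net 69.171.224.0/20 or net 66.220.152.0/21) and not net 10.0.0.5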
/amico_scripts/db_cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Phani Vadrevu # 5 | # pvadrevu@uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import psycopg2 18 | import config 19 | from config import * 20 | 21 | # Connect to database 22 | try: 23 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 24 | %(db_name,db_host,db_user,db_password)) 25 | except: 26 | print "Unable to connect to database: "+db_name 27 | 28 | # Use Autocommit mode for database connection 29 | conn.set_isolation_level(0) 30 | cursor = conn.cursor() 31 | 32 | cursor.execute("""DROP TABLE IF EXISTS pe_dumps,virus_total_scans, 33 | ped_vts_mapping, manual_download_checksums,bgp2asn, 34 | weka_features, virus_total_submissions, amico_scores CASCADE""") 35 | print """Dropped the tables: pe_dumps, virus_total_scans, ped_vts_mapping, 36 | manual_download_checksums, bgp2asn, weka_features, 37 | virus_total_submissions, amico_scores""" 38 | cursor.close() 39 | conn.close() 40 | --------------------------------------------------------------------------------
/amico_scripts/postfile.py: -------------------------------------------------------------------------------- 1 | import httplib, mimetypes 2 | 3 | 4 | def post_multipart(host, selector, fields, files): 5 | """ 6 | Post fields and files to an http host as multipart/form-data. 7 | fields is a sequence of (name, value) elements for regular form fields. 8 | files is a sequence of (name, filename, value) elements for data to be uploaded as files. 9 | Return the server's response page. 10 | """ 11 | content_type, body = encode_multipart_formdata(fields, files) 12 | h = httplib.HTTPSConnection(host, timeout=120) 13 | h.putrequest('POST', selector) 14 | h.putheader('content-type', content_type) 15 | h.putheader('content-length', str(len(body))) 16 | h.endheaders() 17 | h.send(body) 18 | return h.getresponse().read() 19 | 20 | 21 | def encode_multipart_formdata(fields, files): 22 | """ 23 | fields is a sequence of (name, value) elements for regular form fields. 24 | files is a sequence of (name, filename, value) elements for data to be uploaded as files. 25 | Return (content_type, body) ready for an httplib.HTTP instance. 26 | """ 27 | BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' 28 | CRLF = '\r\n' 29 | L = [] 30 | for (key, value) in fields: 31 | L.append('--' + BOUNDARY) 32 | L.append('Content-Disposition: form-data; name="%s"' % key) 33 | L.append('') 34 | L.append(value) 35 | for (key, filename, value) in files: 36 | L.append('--' + BOUNDARY) 37 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) 38 | L.append('Content-Type: %s' % get_content_type(filename)) 39 | L.append('') 40 | L.append(value) 41 | L.append('--' + BOUNDARY + '--') 42 | L.append('') 43 | body = CRLF.join(L) 44 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 45 | return content_type, body 46 | 47 | 48 | def get_content_type(filename): 49 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 50 | --------------------------------------------------------------------------------
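A hypothetical call to post_multipart(), mirroring how vt_api.py submits samples to the VirusTotal v2 scan endpoint used elsewhere in this repo (the API key and file name below are placeholders):

from postfile import post_multipart

fields = [("apikey", "YOUR_VT_API_KEY")]                    # placeholder key
file_data = open("sample.exe", "rb").read()                 # hypothetical local file
files = [("file", "sample.exe", file_data)]
response = post_multipart("www.virustotal.com",
                          "https://www.virustotal.com/vtapi/v2/file/scan",
                          fields, files)
print response   # JSON report returned by the server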
/file_dump/README: -------------------------------------------------------------------------------- 1 | 2 | Usage: ./file_dump [-i NIC] [-r pcap_file] [-A] -d dump_dir [-f "pcap_filter"] [-L lru_cache_size] [-K max_dump_file_size (KB)] [-D debug_level] 3 | 4 | -i : Use to specify network interface (e.g., -i eth0) 5 | -r : Read from .pcap file instead of NIC (e.g., -r file.pcap) 6 | -A : If specified, this flag will turn off the on-the-fly srcIP anonymization 7 | -d : Directory where raw HTTP responses containing reconstructed files are stored (e.g., -d ./dumps) 8 | -f : Specify BPF filter (e.g., -f "tcp port 80") 9 | -L : Change LRU cache size (default = 10000 entries) 10 | -K : Change max accepted reconstructed file size, in KB (e.g., -K 1024) 11 | -D : Specify debug_level (value from 0-4) 12 | -J : extract JAR/APK files 13 | -E : extract ELF files 14 | -G : extract DMG files 15 | -Z : extract ZIP files 16 | -R : extract RAR files 17 | -P : extract PDF files 18 | -M : extract MS DOC files 19 | 20 | ==== 21 | 22 | NOTE: the IPs below have not been updated in a while... 23 | 24 | Suggested BPF filter to reduce load on file_dump packet analysis: 25 | 26 | BPF FILTER = tcp and not (net 69.171.224.0/20 or net 66.220.152.0/21 or net 74.125.0.0/16 or net 220.181.111.0/24 or net 123.125.114.0/24 or net 199.59.148.0/22 or net 65.54.94.0/23 or net 65.55.160.0/19 or net 65.55.192.0/18 or net 66.135.192.0/19 or net 157.166.224.0/20 or net 15.192.0.0/16 or net 143.166.0.0/17 or net 17.148.0.0/14 or net 192.150.16.0/23) 27 | 28 | 29 | Facebook 30 | 69.171.224.0/20 31 | 66.220.152.0/21 32 | 33 | Google/Youtube 34 | 74.125.0.0/16 35 | 36 | Baidu 37 | 220.181.111.0/24 38 | 123.125.114.0/24 39 | 40 | Twitter 41 | 199.59.148.0/22 42 | 43 | Microsoft/MSN/Live.com/Bing 44 | 65.54.94.0/23 45 | 65.55.160.0/19 46 | 65.55.192.0/18 47 | 48 | Ebay 49 | 66.135.192.0/19 50 | 51 | CNN 52 | 157.166.224.0/20 53 | 54 | HP 55 | 15.192.0.0/16 56 | 57 | DELL 58 | 143.166.0.0/17 59 | 60 | APPLE 61 | 17.148.0.0/14 62 | 63 | ADOBE 64 | 192.150.16.0/23 65 | 66 | --------------------------------------------------------------------------------
/file_dump/seq_list.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Roberto Perdisci (perdisci@cs.uga.edu) 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | 18 | #include <stdio.h> 19 | #include <stdlib.h> 20 | #include <string.h> 21 | 22 | typedef unsigned int u_int; 23 | 24 | 25 | ////////////////////////////////////////////////////////////// 26 | // This is an implementation of a simple list that holds pairs: 27 | // (sequence_number, payload_size) 28 | ////////////////////////////////////////////////////////////// 29 | 30 | typedef struct seq_list_entry { 31 | 32 | u_int sn; // sequence number 33 | u_int ps; // payload size 34 | struct seq_list_entry *next; 35 | 36 | } seq_list_entry_t; 37 | 38 | typedef struct seq_list { 39 | 40 | seq_list_entry_t *head; 41 | seq_list_entry_t *tail; 42 | seq_list_entry_t *next; 43 | 44 | } seq_list_t; 45 | 46 | seq_list_t* seq_list_init(void); 47 | void seq_list_destroy(seq_list_t* l, int mz_found); 48 | void seq_list_insert(seq_list_t *l, u_int i, u_int j); 49 | seq_list_entry_t *seq_list_head(seq_list_t *l); 50 | seq_list_entry_t *seq_list_tail(seq_list_t *l); 51 | seq_list_entry_t *seq_list_next(seq_list_t *l); 52 | void seq_list_restart_from_head(seq_list_t *l); 53 | void seq_list_restart_from_element(seq_list_t *l, seq_list_entry_t *e); 54 | u_int seq_list_get_seq_num(seq_list_entry_t *e); 55 | u_int seq_list_get_payload_size(seq_list_entry_t *e); 56 | void seq_list_print(seq_list_t *l); 57 | 58 | --------------------------------------------------------------------------------
/amico_scripts/etld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (c) 2009 Michael Still 4 | # Released under the terms of the GNU GPL v2 5 | 6 | # Mozilla publishes a rule file which may be used to calculate effective TLDs 7 | # at: 8 | # 9 | # http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/src/ 10 | # effective_tld_names.dat?raw=1 11 | # 12 | # Use that file to take a domain name and return a (domain, etld) tuple. 13 | # Documentation for the rule file format is at: 14 | # 15 | # https://wiki.mozilla.org/Gecko:Effective_TLD_Service 16 | 17 | import re 18 | import sys 19 | import time 20 | 21 | class etld(object): 22 | """Helper to determine the effective TLD portion of a domain name.""" 23 | 24 | def __init__(self, datafile='effective_tld_names.dat'): 25 | """Load the data file ready for lookups.""" 26 | 27 | self.rules = {} 28 | 29 | file = open(datafile) 30 | line = file.readline() 31 | while line: 32 | line = line.rstrip() 33 | if line and not line.startswith('//'): 34 | tld = line.split('.')[-1] 35 | self.rules.setdefault(tld, []) 36 | self.rules[tld].append(re.compile(self.regexpize(line))) 37 | 38 | line = file.readline() 39 | file.close() 40 | 41 | def regexpize(self, line): 42 | """Convert a rule to regexp syntax.""" 43 | 44 | line = line[::-1].replace('.', '\\.').replace('*', '[^\\.]*').replace('!', '') 45 | return '^(%s)\.(.*)$' % line 46 | 47 | def parse(self, hostname): 48 | """Parse a hostname into domain and etld portions.""" 49 | 50 | hostname = hostname.lower() 51 | tld = hostname.split('.')[-1] 52 | hostname = hostname[::-1] 53 | domain = '' 54 | etld = '' 55 | 56 | for rule in self.rules[tld]: 57 | m = rule.match(hostname) 58 | if m and m.group(1) > etld: 59 | domain = m.group(2)[::-1] 60 | etld = m.group(1)[::-1] 61 | 62 | if not etld: 63 | raise Exception('Parse failed') 64 | 65 | return (domain, etld) 66 | 67 | 68 | if __name__ == '__main__': 69 | e = etld() 70 | 71 | f = open(sys.argv[1]) 72 | l = f.readline() 73 | start_time = time.time() 74 | 75 | while l: 76 | try: 77 | l = l.rstrip() 78 | print '%s -> %s' %(l, e.parse(l)) 79 | except Exception, ex: 80 | print ex 81 | 82 | l = f.readline() 83 | 84 | print 'Took %f seconds' % (time.time() - start_time) 85 | f.close() 86 | --------------------------------------------------------------------------------
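To make the reversed-regex trick in regexpize()/parse() concrete, here is a worked example using the sample rule "*.uk" (an illustrative rule; it is not necessarily present verbatim in the current data file):

import re

rule = '*.uk'
regexp = '^(%s)\.(.*)$' % rule[::-1].replace('.', '\\.').replace('*', '[^\\.]*').replace('!', '')
print regexp                          # ^(ku\.[^\.]*)\.(.*)$

m = re.match(regexp, 'www.example.co.uk'[::-1])   # match against the reversed hostname
print m.group(1)[::-1]                # co.uk        (the effective TLD)
print m.group(2)[::-1]                # www.example  (the remaining domain part)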
/amico_scripts/pe_extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Roberto Perdisci # 5 | # perdisci@cs.uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import sys, os 18 | import re 19 | from struct import unpack 20 | 21 | def prune_http_resp_headers(data): 22 | # this makes sure we find the actual start of the PE and not a random match 23 | m = re.search('\s\sMZ', data) 24 | if m: 25 | pos = m.start() 26 | data = data[pos:] 27 | 28 | # now we can start copying data from MZ to the end 29 | m = re.search('MZ',data) 30 | if m: 31 | pos = m.start() 32 | return data[pos:] 33 | 34 | def is_pe_file(bin_data): 35 | 36 | if not bin_data: 37 | return False 38 | 39 | if len(bin_data) <= 0: 40 | return False 41 | 42 | m = re.search('MZ', bin_data) 43 | if m: 44 | p = m.start() 45 | offset = p + unpack('i', bin_data[p+0x3c:p+0x3c+4])[0] 46 | # print "p=", p, " offset=", offset 47 | if bin_data[p:p+2] == 'MZ' and bin_data[offset:offset+2] == 'PE': 48 | # print "This is a PE file!" 49 | return True 50 | 51 | print "This is NOT a PE file!" 52 | return False 53 | 54 | 55 | def usage(): 56 | print >> sys.stderr, 'usage: %s [-i device] [-r file] [pcap filter]' % sys.argv[0] 57 | sys.exit(1) 58 | 59 | 60 | def pe_extract(flow_file, dst=None): 61 | if not dst: 62 | dst = flow_file + '.exe' 63 | f = open(flow_file, 'rb') 64 | data = f.read() 65 | f.close() 66 | 67 | data = prune_http_resp_headers(data) 68 | 69 | if is_pe_file(data): 70 | print "Writing file:", flow_file+'.exe' 71 | f = open(dst, 'wb') 72 | f.write(data) 73 | f.close() 74 | return True 75 | 76 | print "Finished!" 77 | return False 78 | 79 | 80 | if __name__ == '__main__': 81 | pe_extract(sys.argv[1]) 82 | -------------------------------------------------------------------------------- /file_dump/lru-cache.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This is an implementation of a O(1) LRU cache. 3 | * Copyright (C) 2010 Roberto Perdisci (perdisci@cs.uga.edu) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include <time.h> 22 | 23 | #define MAX_LRUC_TTL 5*60 // 5 minutes 24 | 25 | typedef unsigned int u_int; 26 | 27 | typedef struct ht_entry { 28 | 29 | const char *key; 30 | struct lruc_entry *le; 31 | struct ht_entry *next; 32 | 33 | } ht_entry_t; 34 | 35 | typedef struct hash_table { 36 | 37 | u_int length; 38 | ht_entry_t **vect; 39 | 40 | } hash_table_t; 41 | 42 | typedef struct lruc_entry { 43 | 44 | char *key; 45 | void *value; 46 | time_t time; 47 | struct lruc_entry *prev; 48 | struct lruc_entry *next; 49 | 50 | } lruc_entry_t; 51 | 52 | typedef struct lru_cache { 53 | 54 | hash_table_t *ht; // pointer to the Hash Table for O(1) searches 55 | lruc_entry_t *top; // pointer to the top of the LRU cache 56 | void (*destroy_val_fn)(void*); // callback function for destroying an entry value 57 | 58 | u_int num_entries; 59 | u_int max_entries; 60 | 61 | } lru_cache_t; 62 | 63 | hash_table_t* ht_init(u_int length); 64 | lru_cache_t* lruc_init_str(u_int max_entries); 65 | lru_cache_t* lruc_init(u_int max_entries, void (*destroy_val_fn)(void*)); 66 | 67 | void ht_insert(hash_table_t *ht, lruc_entry_t *lruc_e, const char *key); 68 | void ht_delete(hash_table_t *ht, const char *key); 69 | void ht_destroy(hash_table_t* ht); 70 | int lruc_insert_str(lru_cache_t *lruc, const char *key, const char* value); 71 | int lruc_insert(lru_cache_t *lruc, const char *key, void* value); 72 | void lruc_delete(lru_cache_t *lruc, const char *key); 73 | void lruc_destroy(lru_cache_t *lruc); 74 | 75 | lruc_entry_t* ht_search(const hash_table_t *ht, const char *key); 76 | char* lruc_search_str(lru_cache_t *lruc, const char *key); 77 | void* lruc_search(lru_cache_t *lruc, const char *key); 78 | 79 | u_int hash_fn(const char* key); 80 | u_int DJBHash(const char* str, u_int len); 81 | 82 | void print_ht(hash_table_t *ht); 83 | void
print_lruc(lru_cache_t *lruc); 84 | void clean_lruc(lru_cache_t *lruc); 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /amico_scripts/features.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2013 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | features = ( 15 | ["dump_id", 16 | "vt_month_shelf", 17 | "corrupt", 18 | "host_malware_downloads", 19 | "host_suspicious_downloads", 20 | "host_benign_downloads", 21 | "host_total_downloads", 22 | "host_malware_ratio", 23 | "host_suspicious_ratio", 24 | "host_benign_ratio", 25 | "host_avg_av_labels", 26 | "host_avg_trusted_labels", 27 | "host_unknown_hashes", 28 | "host_total_hashes", 29 | "host_unknown_hash_ratio", 30 | "twold_malware_downloads", 31 | "twold_suspicious_downloads", 32 | "twold_benign_downloads", 33 | "twold_total_downloads", 34 | "twold_malware_ratio", 35 | "twold_suspicious_ratio", 36 | "twold_benign_ratio", 37 | "twold_avg_av_labels", 38 | "twold_avg_trusted_labels", 39 | "twold_unknown_hashes", 40 | "twold_total_hashes", 41 | "twold_unknown_hash_ratio", 42 | "server_ip_malware_downloads", 43 | "server_ip_suspicious_downloads", 44 | "server_ip_benign_downloads", 45 | "server_ip_total_downloads", 46 | "server_ip_malware_ratio", 47 | "server_ip_suspicious_ratio", 48 | "server_ip_benign_ratio", 49 | "server_ip_avg_av_labels", 50 | "server_ip_avg_trusted_labels", 51 | "server_ip_unknown_hashes", 52 | "server_ip_total_hashes", 53 | "server_ip_unknown_hash_ratio", 54 | "bgp_malware_downloads", 55 | "bgp_suspicious_downloads", 56 | "bgp_benign_downloads", 57 | "bgp_total_downloads", 58 | "bgp_malware_ratio", 59 | "bgp_suspicious_ratio", 60 | "bgp_benign_ratio", 61 | "bgp_avg_av_labels", 62 | "bgp_avg_trusted_labels", 63 | "bgp_unknown_hashes", 64 | "bgp_total_hashes", 65 | "bgp_unknown_hash_ratio", 66 | "hash_life_time", 67 | "num_dumps_with_same_hash", 68 | "hash_daily_dump_rate_per_client", 69 | "estimated_clients_with_same_hash", 70 | "referer_exists", 71 | "host_name_exists", 72 | "extension_class", 73 | "url_length", 74 | "directory_depth", 75 | "sha1", 76 | "host", 77 | "url_malware_downloads", 78 | "url_total_downloads", 79 | "url_distinct_sha1s", 80 | "url_struct", 81 | "url_struct_malware_downloads", 82 | "url_struct_total_downloads", 83 | "url_struct_distinct_sha1s"]) 84 | -------------------------------------------------------------------------------- /amico_scripts/db_syslog.py: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # copyright (c) 2011 phani vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free 
Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from config import amico_threshold 17 | import syslog 18 | import time 19 | 20 | import util 21 | from util import reorder_domain 22 | 23 | # Wait for db_virus_total to complete 24 | WAIT_TIME = 60 25 | 26 | 27 | def make_syslog_entry(cursor, dump_id, score): 28 | # Database query to get the relevant record 29 | cursor.execute(""" 30 | SELECT timestamp, client, server, dst_port, host, url, referer, 31 | pe.sha1, pe.md5, file_size, num_av_labels, corrupt, file_type 32 | FROM pe_dumps as pe LEFT JOIN virus_total_scans as vts USING(sha1) 33 | WHERE (corrupt = 'false' OR num_av_labels IS NOT NULL) AND 34 | dump_id = %s 35 | ORDER BY vts.query_time DESC 36 | """ % (dump_id,)) 37 | if cursor.rowcount == 0: 38 | return 39 | log_data = list(cursor.fetchone()) 40 | log_data[4] = reorder_domain(log_data[4]) 41 | 42 | # if a score != None is passed as an argument, use it to label the report (otherwise the report field stays "-") 43 | report = "-" 44 | 45 | if score is not None: 46 | score = float(score) # just to make sure we are dealing with real numbers and not a string ... 47 | if score > amico_threshold: 48 | report = "MALWARE" 49 | else: 50 | report = "BENIGN" 51 | report += "#%s#%s" % (score, amico_threshold) 52 | 53 | log_data.append(report) 54 | 55 | if log_data: 56 | #print log_data 57 | entry = ("file download -- timestamp: %s, client_ip: %s, server_ip:" 58 | " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:" 59 | " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s" % 60 | tuple(log_data)) 61 | # syslog.syslog(syslog.LOG_ALERT,q) 62 | syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry) 63 | 64 | 65 | def db_syslog(dump_id,score): 66 | time.sleep(WAIT_TIME) 67 | conn = util.connect_to_db() 68 | cursor = conn.cursor() 69 | make_syslog_entry(cursor, dump_id, score) 70 | cursor.close() 71 | conn.close() 72 | 73 | 74 | if __name__ == "__main__": 75 | dump_id = sys.argv[1] 76 | score = float(sys.argv[2]) 77 | db_syslog(dump_id,score) 78 | --------------------------------------------------------------------------------
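The entry format above fully determines the syslog line. A sketch that renders one entry with invented field values (every value below is hypothetical, chosen only to show the layout; the report segment corresponds to a score of 0.82 against the default threshold of 0.4):

log_data = ['2014-10-01 12:34:56', '10.0.0.42', '203.0.113.7', '80',
            'com.example.www', '/download/setup.exe', 'http://www.example.com/',
            'da39a3ee5e6b4b0d3255bfef95601890afd80709',
            'd41d8cd98f00b204e9800998ecf8427e', '123456', '3', 'False', 'EXE']
log_data.append("MALWARE#0.82#0.4")   # report string: label#score#threshold
print ("file download -- timestamp: %s, client_ip: %s, server_ip:"
       " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
       " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s" %
       tuple(log_data))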
/amico_scripts/config.py.tmpl: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | # Postgres DB Info 16 | db_host = "localhost" 17 | db_name = "" 18 | db_user = "" 19 | db_password = "" 20 | 21 | # VirusTotal API Keys, as a list of Python Strings eg: ["abcd", "efgh"] 22 | # Get your VT API key at: https://www.virustotal.com/en/ 23 | vt_keys = [] 24 | 25 | # "live", "manual" or None 26 | # If vt_submissions = "manual", fill the manual_download_ip variable in 27 | # file_dump/config.py 28 | vt_submissions = "live" 29 | vt_submissions_ext = ['exe','apk','dmg','jar'] 30 | 31 | trusted_av_vendors = ["Avast", "AVG", "F-Secure", "Kaspersky", "McAfee", 32 | "Microsoft", "Sophos", "Symantec", "TrendMicro"] 33 | 34 | 35 | # file types that should be captured and stored on disk 36 | # see extract_file.py for available types 37 | capture_file_types = ["EXE"] 38 | 39 | 40 | # Manual downloads are only enabled if vt_submissions = "manual". 41 | # Every time a download is detected, a new HTTP request is issued to the URL 42 | # from which the executable was downloaded, and the file is re-fetched from 43 | # the same webserver. We refer to this as a manual download. Please specify the 44 | # directory where these manual downloads should be saved 45 | MAN_DOWNLOAD_DIR = "manual_downloads" 46 | LIVE_DOWNLOAD_DIR = "parsed/captured_files" 47 | 48 | # parameters for feature computation based on past activity 49 | MAX_PAST_DUMPS = 100000 # only go back up to the past MAX_PAST_DUMPS dumps 50 | MAX_PAST_DAYS = 30 # go back in time MAX_PAST_DAYS days max 51 | 52 | # When running AMICO under a SOCKS proxy, use these. 53 | # Else, have socks_proxy_host as "None" 54 | #socks_proxy_host = "localhost" 55 | socks_proxy_host = None 56 | socks_proxy_port = 12345 57 | 58 | 59 | # The threshold value for classification between (0,1) 60 | # used in db_syslog.py script 61 | amico_threshold = 0.4 62 | 63 | # The name of the training model file to be used for 64 | # classification.
Use the trainer.py script to create 65 | # a new model specific to your network 66 | model_file = "models/default.model" 67 | 68 | whitelist_domains = [ 69 | "windowsupdate.com", 70 | "avg.com", 71 | "microsoft.com", 72 | "adobe.com", 73 | "apple.com", 74 | "google.com", 75 | "se.360.cn"] 76 | -------------------------------------------------------------------------------- /file_dump/search.c: -------------------------------------------------------------------------------- 1 | /* 2 | * NOTE ON LICENSING FOR THIS FILE 3 | * The following code was borrowed from 4 | * Wikipedia.org 5 | * For example it can be found here: 6 | * http://en.wikipedia.org/w/index.php?title=Boyer%E2%80%93Moore_string_search_algorithm&oldid=399934077 7 | */ 8 | 9 | # include <limits.h> 10 | # include <string.h> 11 | 12 | # define ALPHABET_SIZE (1 << CHAR_BIT) 13 | 14 | static void compute_prefix(const char* str, size_t size, int result[size]) { 15 | size_t q; 16 | int k; 17 | result[0] = 0; 18 | 19 | k = 0; 20 | for (q = 1; q < size; q++) { 21 | while (k > 0 && str[k] != str[q]) 22 | k = result[k-1]; 23 | 24 | if (str[k] == str[q]) 25 | k++; 26 | result[q] = k; 27 | } 28 | } 29 | 30 | static void prepare_badcharacter_heuristic(const char *str, size_t size, 31 | int result[ALPHABET_SIZE]) { 32 | 33 | size_t i; 34 | 35 | for (i = 0; i < ALPHABET_SIZE; i++) 36 | result[i] = -1; 37 | 38 | for (i = 0; i < size; i++) 39 | result[(size_t) str[i]] = i; 40 | } 41 | 42 | void prepare_goodsuffix_heuristic(const char *normal, size_t size, 43 | int result[size + 1]) { 44 | 45 | char *left = (char *) normal; 46 | char *right = left + size; 47 | char reversed[size+1]; 48 | char *tmp = reversed + size; 49 | size_t i; 50 | 51 | /* reverse string */ 52 | *tmp = 0; 53 | while (left < right) 54 | *(--tmp) = *(left++); 55 | 56 | int prefix_normal[size]; 57 | int prefix_reversed[size]; 58 | 59 | compute_prefix(normal, size, prefix_normal); 60 | compute_prefix(reversed, size, prefix_reversed); 61 | 62 | for (i = 0; i <= size; i++) { 63 | result[i] = size - prefix_normal[size-1]; 64 | } 65 | 66 | for (i = 0; i < size; i++) { 67 | const int j = size - prefix_reversed[i]; 68 | const int k = i - prefix_reversed[i]+1; 69 | 70 | if (result[j] > k) 71 | result[j] = k; 72 | } 73 | } 74 | /* 75 | * Boyer-Moore search algorithm 76 | */ 77 | const char *boyermoore_search(const char *haystack, const char *needle) { 78 | /* 79 | * Calc string sizes 80 | */ 81 | size_t needle_len, haystack_len; 82 | needle_len = strlen(needle); 83 | haystack_len = strlen(haystack); 84 | 85 | /* 86 | * Simple checks 87 | */ 88 | if(haystack_len == 0) 89 | return NULL; 90 | if(needle_len == 0) 91 | return haystack; 92 | 93 | /* 94 | * Initialize heuristics 95 | */ 96 | int badcharacter[ALPHABET_SIZE]; 97 | int goodsuffix[needle_len+1]; 98 | 99 | prepare_badcharacter_heuristic(needle, needle_len, badcharacter); 100 | prepare_goodsuffix_heuristic(needle, needle_len, goodsuffix); 101 | 102 | /* 103 | * Boyer-Moore search 104 | */ 105 | size_t s = 0; 106 | while(s <= (haystack_len - needle_len)) 107 | { 108 | size_t j = needle_len; 109 | while(j > 0 && needle[j-1] == haystack[s+j-1]) 110 | j--; 111 | 112 | if(j > 0) 113 | { 114 | int k = badcharacter[(size_t) haystack[s+j-1]]; 115 | int m; 116 | if(k < (int)j && (m = j-k-1) > goodsuffix[j]) 117 | s+= m; 118 | else 119 | s+= goodsuffix[j]; 120 | } 121 | else 122 | { 123 | return haystack + s; 124 | } 125 | } 126 | 127 | /* not found */ 128 | return NULL; 129 | } 130 | --------------------------------------------------------------------------------
/amico_scripts/update_urls_fix.py: -------------------------------------------------------------------------------- 1 | # Author: Phani Vadrevu 2 | # 3 | # This script fixes a bug related to empty URLs in Amico's DB 4 | # It reparses raw file dumps to fill missing URLs 5 | # It should only be used to correct missing URLs produced 6 | # by the version of Amico's code before "dev" branch commit 7 | # b1d39fcf158441af61a59a571b342e9826a46c9d 8 | 9 | import logging 10 | import re 11 | import os 12 | 13 | import util 14 | 15 | RAW_FILE_DIR = "/home/perdisci/amico/amico_scripts/parsed/raw_files/" 16 | LOG_FILE = "/home/perdisci/amico/amico_scripts/parsed/update_urls_amico.log" 17 | 18 | def update_url(file_path,conn): 19 | #print "Time b4 http parsing: %f" %(time.time(),) 20 | # Use Autocommit mode for database connection 21 | 22 | fileHandle = open(file_path) 23 | 24 | # Timestamp 25 | r = re.compile('[0-9]+') 26 | timestamp = r.search(fileHandle.readline()) 27 | if timestamp is not None: 28 | timestamp = timestamp.group() 29 | #print timestamp.group() 30 | 31 | # Source and Destination IPs 32 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 33 | ip = r.search(fileHandle.readline()) 34 | if ip is not None: 35 | srcip = ip.group(2) 36 | dstip = ip.group(1) 37 | dst_port = ip.group(3) 38 | #print ip.group(1) 39 | #print ip.group(2) 40 | else: 41 | srcip = None 42 | dstip = None 43 | dst_port = None 44 | 45 | # URL 46 | # for efficiency purposes, skip files that were not affected by the bug 47 | url_line = fileHandle.readline() 48 | if " HTTP/1" in url_line: 49 | return 50 | 51 | r = re.compile('(GET|POST|HEAD) (.*)') 52 | url = r.search(url_line) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | toks = url.split() 58 | url = toks[0] 59 | #print url.group(1) 60 | else: 61 | method = None 62 | 63 | if url is None or len(url.strip())==0: 64 | logging.warning('URL is empty for file: %s' % (file_path,)) 65 | return 66 | 67 | 68 | cursor = conn.cursor() 69 | 70 | cursor.execute(""" 71 | SELECT dump_id FROM pe_dumps 72 | WHERE timestamp = TO_TIMESTAMP(%s) AND server = %s AND client = %s 73 | AND dst_port = %s AND url IS NULL """, (timestamp, srcip, dstip, dst_port)) 74 | if cursor.rowcount > 1: 75 | logging.warning('Found more than one dump_id for file: %s' % (file_path,)) 76 | # elif cursor.rowcount == 0: 77 | # logging.warning('Found no dump_id for file: %s', (file_path,)) 78 | elif cursor.rowcount == 1: 79 | dump_id = cursor.fetchone() 80 | if len(url.strip())>0: 81 | cursor.execute(""" 82 | UPDATE pe_dumps SET url = %s 83 | WHERE dump_id = %s """, (url.strip(), dump_id)) 84 | logging.debug('Updated URL for dump_id: %s (file: %s | url: %s)' % (dump_id,file_path,url)) 85 | 86 | 87 | def main(): 88 | conn = util.connect_to_db() 89 | 90 | logging.basicConfig(level=logging.DEBUG, 91 | filename=LOG_FILE, 92 | filemode='w') 93 | raw_file_names = os.listdir(RAW_FILE_DIR) 94 | for fn in raw_file_names: 95 | file_path = os.path.join(RAW_FILE_DIR, fn) 96 | print "Analyzing file:", file_path 97 | update_url(file_path,conn) 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /amico_scripts/vt_api.py: -------------------------------------------------------------------------------- 1 | 2 | # Author: Phani Vadrevu 3 | 4 | import os.path 5 | import urllib 6 | import urllib2 7 | import random 8 | 9 | import postfile 10 | import config 11 | from config import * 12 | 
13 | TIMEOUT = 10 14 | 15 | 16 | def get_vt_key(): 17 | #random.seed() 18 | k = random.randint(0, len(vt_keys) - 1) 19 | print "Using VT API key number", k 20 | return vt_keys[k] # vt_keys must be a list of valid VirusTotal API keys 21 | 22 | 23 | def send_file(md5): 24 | host = "www.virustotal.com" 25 | selector = "https://www.virustotal.com/vtapi/v2/file/scan" 26 | fields = [("apikey", get_vt_key())] 27 | 28 | dir_path = "" 29 | if vt_submissions == "manual": 30 | dir_path = MAN_DOWNLOAD_DIR 31 | else: 32 | dir_path = LIVE_DOWNLOAD_DIR 33 | 34 | # just a patch to old code... 35 | # we only submit the first file that matches 36 | # it is anyway highly unlikely that more than one would match 37 | file_name = None 38 | file_path = None 39 | for ext in vt_submissions_ext: 40 | for e in [ext.lower(),ext.upper()]: 41 | fn = md5 + "." + e 42 | fp = os.path.join(dir_path,fn) 43 | if os.path.isfile(fp): 44 | file_name = fn 45 | file_path = fp 46 | break 47 | if file_path: # also stop scanning the remaining extensions after a match 48 | break 49 | 50 | if file_path and os.path.isfile(file_path): 51 | print "VT file submission:", file_path 52 | file_to_send = open(file_path, "rb").read() 53 | files = [("file", file_name, file_to_send)] 54 | json = postfile.post_multipart(host, selector, fields, files) 55 | return json 56 | 57 | 58 | # Either a single hash or a list of hashes (up to 25) can be passed 59 | def rescan_request(arg): 60 | if isinstance(arg, list): 61 | res = "" 62 | for file_hash in arg: 63 | res += file_hash + ', ' 64 | res = res[:-2] 65 | else: 66 | res = arg 67 | url = "https://www.virustotal.com/vtapi/v2/file/rescan" 68 | parameters = {"resource": res, 69 | "apikey": get_vt_key()} 70 | data = urllib.urlencode(parameters) 71 | req = urllib2.Request(url, data) 72 | try: 73 | response = urllib2.urlopen(req, timeout=5*TIMEOUT) 74 | except Exception as e: 75 | print "rescan_request: Exception occurred", e 76 | return 77 | json = response.read() 78 | return json 79 | 80 | 81 | # md5 or sha1 can also be used instead of scan_id 82 | def get_vt_report(scan_id): 83 | url = "https://www.virustotal.com/vtapi/v2/file/report" 84 | parameters = {"resource": scan_id, 85 | "apikey": get_vt_key()} 86 | data = urllib.urlencode(parameters) 87 | req = urllib2.Request(url, data) 88 | try: 89 | response = urllib2.urlopen(req, timeout=TIMEOUT) 90 | except Exception as e: 91 | print "get_vt_report: Exception occurred", e 92 | return 93 | json = response.read() 94 | return json 95 | 96 | 97 | def get_ip_report(ip): 98 | url = "https://www.virustotal.com/vtapi/v2/ip-address/report" 99 | parameters = {"ip": ip, 100 | "apikey": get_vt_key()} 101 | data = urllib.urlencode(parameters) 102 | req = urllib2.Request("%s?%s" % (url, data)) 103 | try: 104 | response = urllib2.urlopen(req, timeout=TIMEOUT) 105 | except Exception as e: 106 | print "get_ip_report: Exception occurred", e 107 | return 108 | json = response.read() 109 | return json 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview # 2 | 3 | **AMICO** is a malware download classification tool that can be deployed in large networks. It reconstructs executable files (e.g., EXE, DLL, DMG, APK, JAR, etc.) from the network traffic and determines if they are malicious or not based on their _provenance information_. 4 | 5 | To classify a file download event, AMICO looks at **who** is downloading **what** and from **where**, rather than analyzing the content of the downloaded files.
6 | 7 | For more technical information, please refer to this [ESORICS 2013 research paper](http://www.perdisci.com/publications/publication-files/amico.pdf) 8 | 9 | **Code**: The latest code, which reconstructs and classifies file dumps other than Windows PE executables, is in the `dev` branch (the `older_code` branch contains the original code dedicated to reconstructing only Windows PE files). 10 | 11 | For more information on how to use and deploy AMICO, please go through the Wiki pages. This is an initial release of the system and we will keep refining the code and documentation. Please open a new Issue if you experience any problems. 12 | 13 | You can also visit our [AMICO-Security Blog](http://amico-security.blogspot.com/). 14 | 15 | 16 | ## SETUP AND DEPLOYMENT ## 17 | Please refer to our [project's Wiki](https://github.com/perdisci/amico/wiki) for detailed information about system requirements, setup, and deployment guidelines. 18 | 19 | 20 | ## CONTACT US ## 21 | If you have any questions, please post a message on our [AMICO-security forum](https://groups.google.com/forum/#!forum/amico-security). 22 | 23 | If you are deploying AMICO in a large _university-like campus network_ and would like to share your experience or know more about our own deployment, please contact us privately at (**perdisci [-at-] cs.uga.edu**). 24 | 25 | 26 | ## LICENSING ## 27 | The code under the "older_code" branch is released under BSD license. Please refer to the COPYING file under that branch for details. 28 | 29 | ## News ## 30 | * [01/17/2017] Written [some guidelines](https://amico-security.blogspot.com/2017/01/installing-pfring.html) on how to install pf_ring and ZC drivers 31 | * [01/11/2016] Enabled submission of file types other than EXE to VirusTotal (in the experimental branch only). 32 | * [04/29/2015] Improved [experimental branch code](https://github.com/perdisci/amico/tree/experimental), and tested capture and classification of APKs and JARs in a large network. 33 | * [03/27/2015] All code in the master branch has been released under **BSD license**. 34 | * [03/27/2015] Moved all project files from GoogleCode to GitHub. 35 | * [01/14/2015] Added some documentation about [syslog reports format](https://github.com/perdisci/amico/wiki/Syslog-Reports-Format). 36 | * [11/20/2014] Added experimental code for supporting file formats other than Windows PE (see svn/branches/experimental). We can currently extract most JAR, APK, DMG, ZIP, RAR, PDF files, and even some Microsoft Office documents. _Limitations_: the feature extraction and provenance classifier currently treat all file types the same way; we are performing more research to see if the behavior-based detection approach used by AMICO can still work well even with non-executable files. 37 | * [11/08/2014] We have created the [AMICO-Security Blog](http://amico-security.blogspot.com/), where we discuss malware campaign discoveries and other related topics. 38 | * [10/09/2014] Quick steps for [tuning packet capture](https://github.com/perdisci/amico/wiki/Tuning-Packet-Capture) and drastically reduce packet loss. 39 | * [10/03/2014] Added a brief [example of how AMICO can be deployed](https://github.com/perdisci/amico/wiki/Deployment-Example) in a network. 40 | * [09/15/2014] We recently fixed a number of rarely-triggered bugs and improved general code quality and stability. 41 | * [09/13/2014] In the Wiki, you can now find more information about the [pe\_dump](https://github.com/perdisci/amico/wiki/pe_dump-Module) component of AMICO. 
42 | * [08/26/2014] We successfully built a PF\_RING-aware version of AMICO (see [how we did it](https://github.com/perdisci/amico/blob/master/external_libs/README)) 43 | -------------------------------------------------------------------------------- /amico_scripts/classify_dump.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2013 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | import sys 16 | import subprocess 17 | 18 | import psycopg2.extras 19 | 20 | import util 21 | from features import features 22 | from config import model_file 23 | 24 | output_file = "test.arff" 25 | 26 | 27 | def print_arff(dump_id): 28 | conn = util.connect_to_db() 29 | cursor = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) 30 | cursor.execute(""" 31 | SELECT * FROM weka_features 32 | WHERE dump_id = %s""", 33 | (dump_id, )) 34 | if cursor.rowcount == 0: 35 | print "Feature vector not found. Exiting..." 36 | return 37 | res = cursor.fetchone() 38 | res = res._asdict() 39 | del res['raw_dump_num_av_labels'] 40 | del res['raw_dump_trusted_av_labels'] 41 | 42 | w = open(output_file, 'w') 43 | w.write('@RELATION test\n\n') 44 | values = [] 45 | for feature in features: 46 | if feature in ['sha1', 'dump_id', 'host', 'corrupt', 'vt_month_shelf', 47 | 'url_struct']: 48 | data_type = "STRING" 49 | elif feature == "extension_class": 50 | data_type = ("{common_ext,unknown_ext,common_fake,other_ext," 51 | "no_url,no_ext}") 52 | else: 53 | data_type = "NUMERIC" 54 | w.write('@ATTRIBUTE %s %s\n' % (feature, data_type)) 55 | values.append(res[feature]) 56 | #print "%s : %s" % (key, res[key]) 57 | 58 | w.write('@ATTRIBUTE class {pos, neg}\n\n') 59 | w.write('@DATA\n\n') 60 | try: 61 | data_string = ','.join(['?' if (value is None or value == '') else 62 | str(value) for value in values]) 63 | except Exception as e: 64 | print "Error in writing feature vector to file!", e 65 | else: 66 | data_string += ",?" 67 | w.write(data_string + '\n') 68 | w.close() 69 | cursor.close() 70 | conn.close() 71 | 72 | 73 | def classify_dump(dump_id): 74 | print_arff(dump_id) 75 | subprocess.call( 76 | "java -Xmx2000m -cp ./weka.jar " 77 | "weka.classifiers.meta.FilteredClassifier " 78 | "-l %s -p 1,58,59 -distribution -T test.arff " 79 | "> test.result" % (model_file,), shell=True) 80 | 81 | 82 | score = None 83 | with open('test.result', 'r') as f: 84 | for line in f: 85 | if ':' in line: 86 | for word in line.split(): 87 | if '*' in word: 88 | score = word.split(',')[0] 89 | if score.startswith('*'): 90 | score = score[1:] 91 | subprocess.call("rm test.arff", shell=True) 92 | subprocess.call("rm test.result", shell=True) 93 | 94 | print "AMICO Score:", score 95 | update_score(dump_id,score) 96 | 97 | return score 98 | 99 | 100 | def update_score(dump_id,score): 101 | conn = util.connect_to_db() 102 | cursor = conn.cursor() 103 | cursor.execute(""" 104 | DELETE FROM amico_scores 105 | WHERE dump_id = %s""", 106 | (dump_id, )) 107 | cursor.execute("INSERT INTO amico_scores VALUES " 108 | "(%s, %s)", (dump_id, score)) 109 | 110 | 111 | 112 | if __name__ == "__main__": 113 | dump_id = int(sys.argv[1]) 114 | #print_arff(dump_id) # For testing 115 | classify_dump(dump_id) 116 | --------------------------------------------------------------------------------
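The score-parsing loop in classify_dump() scans Weka's prediction output for the starred entry in the class distribution. A sketch of the extraction on one hypothetical test.result line (the exact column layout depends on the Weka version; this simply mirrors the split-on-'*' logic above):

line = "    1        1:pos        1:pos       *0.82,0.18"   # hypothetical Weka -p output
score = None
for word in line.split():
    if '*' in word:
        score = word.split(',')[0].lstrip('*')   # same effect as the startswith('*') check
print score   # 0.82 -> stored in amico_scores and compared against amico_threshold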
/amico_scripts/util.py: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # Copyright (C) 2012 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | """ 16 | Utility functions should be added here 17 | """ 18 | import re 19 | import socket 20 | import socks 21 | import psycopg2 22 | import etld 23 | 24 | from config import * 25 | 26 | 27 | def connect_to_db(): 28 | try: 29 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 30 | % (db_name, db_host, db_user, db_password)) 31 | except Exception as e: 32 | print "Unable to connect to database: " + db_name 33 | print e 34 | conn.set_isolation_level(0) 35 | return conn 36 | 37 | 38 | # Reorder the subdomains in the host name such that 39 | # the TLD comes first. Eg: com.google.www 40 | def reorder_domain(host): 41 | if host is None: 42 | return None 43 | 44 | try: 45 | host = host.split(':')[0] # in case host string contains port 46 | ipreg = re.compile("[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$") 47 | if ipreg.match(host) is None: 48 | ordered_host = "" 49 | host += '.'
50 | domains = re.findall('.*?\.', host) 51 | for i in range(len(domains)): 52 | ordered_host += domains[len(domains) - i - 1] 53 | ordered_host = ordered_host[:-1] 54 | return ordered_host 55 | else: 56 | return host 57 | except Exception as e: 58 | print "exception in reorder_domain for host: %s" % (host,) 59 | print e 60 | return host 61 | 62 | 63 | def is_ip(string): 64 | ipreg = re.compile("[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$") 65 | if ipreg.match(string) is not None: 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def extract_extension(url): 72 | file_name = url.split('?')[0].split('/')[-1] 73 | if '.' in file_name: 74 | ext = file_name.split('.')[-1] 75 | return ext 76 | else: 77 | return None 78 | 79 | 80 | def extract_twold(hostname): 81 | if hostname is None: 82 | return None 83 | 84 | hostname = hostname.strip() 85 | if len(hostname) == 0: 86 | return None 87 | if isIP4Address(hostname): 88 | return None 89 | 90 | try: 91 | etld_obj = etld.etld() 92 | registered = '' 93 | suffix = '' 94 | registered, suffix = etld_obj.parse(hostname) 95 | twold = '.'.join([registered.split('.')[-1], suffix]) 96 | print "hostname: %s -- twold: %s" % (hostname,twold) 97 | return twold 98 | except: 99 | print "Unable to compute twold: hostname: %s" % (hostname,) 100 | 101 | return None 102 | 103 | def isIP4Address(hostname): 104 | ip4reg = re.compile( 105 | "([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$") 106 | m = ip4reg.match(hostname) 107 | if m is not None: 108 | return True 109 | 110 | # Reverse the IP address for querying origin.asn.cymru.com 111 | def reverse_ip(ip): 112 | ipreg = re.compile( 113 | "([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$") 114 | m = ipreg.match(ip) 115 | if m is not None: 116 | return (m.group(4) + "." + m.group(3) + "." + m.group(2) 117 | + "." + m.group(1)) 118 | 119 | 120 | # Setup SOCKS proxy 121 | def setup_socks(): 122 | if socks_proxy_host: 123 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, socks_proxy_host, 124 | socks_proxy_port) 125 | socket.socket = socks.socksocket 126 | -------------------------------------------------------------------------------- /amico_scripts/fe_db_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2012 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. 
# 13 | # # 14 | ########################################################################### 15 | import util 16 | import sys 17 | from config import * 18 | 19 | def fe_db_setup(): 20 | conn = util.connect_to_db() 21 | cursor = conn.cursor() 22 | 23 | cursor.execute(""" DROP table if exists features""") 24 | cursor.execute(""" DROP table if exists weka_features""") 25 | cursor.execute(""" 26 | CREATE TABLE weka_features( 27 | dump_id INT, 28 | raw_dump_num_av_labels INT, 29 | raw_dump_trusted_av_labels INT, 30 | vt_month_shelf BOOLEAN, 31 | corrupt BOOLEAN, 32 | host_malware_downloads INT, 33 | host_suspicious_downloads INT, 34 | host_benign_downloads INT, 35 | host_total_downloads INT, 36 | host_malware_ratio REAL, 37 | host_suspicious_ratio REAL, 38 | host_benign_ratio REAL, 39 | host_avg_av_labels REAL, 40 | host_avg_trusted_labels REAL, 41 | host_unknown_hashes INT, 42 | host_total_hashes INT, 43 | host_unknown_hash_ratio REAL, 44 | twold_malware_downloads INT, 45 | twold_suspicious_downloads INT, 46 | twold_benign_downloads INT, 47 | twold_total_downloads INT, 48 | twold_malware_ratio REAL, 49 | twold_suspicious_ratio REAL, 50 | twold_benign_ratio REAL, 51 | twold_avg_av_labels REAL, 52 | twold_avg_trusted_labels REAL, 53 | twold_unknown_hashes INT, 54 | twold_total_hashes INT, 55 | twold_unknown_hash_ratio REAL, 56 | server_ip_malware_downloads INT, 57 | server_ip_suspicious_downloads INT, 58 | server_ip_benign_downloads INT, 59 | server_ip_total_downloads INT, 60 | server_ip_malware_ratio REAL, 61 | server_ip_suspicious_ratio REAL, 62 | server_ip_benign_ratio REAL, 63 | server_ip_avg_av_labels REAL, 64 | server_ip_avg_trusted_labels REAL, 65 | server_ip_unknown_hashes INT, 66 | server_ip_total_hashes INT, 67 | server_ip_unknown_hash_ratio REAL, 68 | bgp_malware_downloads INT, 69 | bgp_suspicious_downloads INT, 70 | bgp_benign_downloads INT, 71 | bgp_total_downloads INT, 72 | bgp_malware_ratio REAL, 73 | bgp_suspicious_ratio REAL, 74 | bgp_benign_ratio REAL, 75 | bgp_avg_av_labels REAL, 76 | bgp_avg_trusted_labels REAL, 77 | bgp_unknown_hashes INT, 78 | bgp_total_hashes INT, 79 | bgp_unknown_hash_ratio REAL, 80 | hash_life_time INT, 81 | num_dumps_with_same_hash INT, 82 | hash_daily_dump_rate_per_client REAL, 83 | estimated_clients_with_same_hash INT, 84 | referer_exists INT, 85 | host_name_exists INT, 86 | extension_class VARCHAR(20), 87 | url_length INT, 88 | directory_depth INT, 89 | sha1 VARCHAR(40), 90 | host VARCHAR(256), 91 | url_malware_downloads INT, 92 | url_total_downloads INT, 93 | url_distinct_sha1s INT, 94 | url_struct VARCHAR(512), 95 | url_struct_malware_downloads INT, 96 | url_struct_total_downloads INT, 97 | url_struct_distinct_sha1s INT) 98 | """) 99 | 100 | print "Created weka_features table!" 
101 | 102 | conn.commit() 103 | cursor.close() 104 | conn.close() 105 | 106 | if __name__ == '__main__': 107 | sys.exit(fe_db_setup()) 108 | -------------------------------------------------------------------------------- /amico_scripts/db_pe_dumps.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import re 16 | import sys 17 | from config import * 18 | 19 | import util 20 | 21 | def db_pe_dumps(file_path, sha1, md5, file_size): 22 | #print "Time b4 http parsing: %f" %(time.time(),) 23 | # Use Autocommit mode for database connection 24 | conn = util.connect_to_db() 25 | cursor = conn.cursor() 26 | 27 | fileHandle = open(file_path) 28 | 29 | # Timestamp 30 | r = re.compile('[0-9]+') 31 | timestamp = r.search(fileHandle.readline()) 32 | if timestamp is not None: 33 | timestamp = timestamp.group() 34 | #print timestamp.group() 35 | 36 | # Source and Destination IPs 37 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 38 | ip = r.search(fileHandle.readline()) 39 | if ip is not None: 40 | srcip = ip.group(2) 41 | dstip = ip.group(1) 42 | dst_port = ip.group(3) 43 | #print ip.group(1) 44 | #print ip.group(2) 45 | else: 46 | srcip = None 47 | dstip = None 48 | dst_port = None 49 | 50 | # URL 51 | r = re.compile('(GET|POST|HEAD) (.*) ') 52 | url = r.search(fileHandle.readline()) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | #print url.group(1) 58 | else: 59 | method = None 60 | 61 | 62 | # Host 63 | r = re.compile('Host: (.*)') 64 | host = r.search(fileHandle.readline()) 65 | if host is not None: 66 | host = host.group(1) 67 | host = util.reorder_domain(host.strip()) 68 | #print host.group(1) 69 | 70 | 71 | # Referer 72 | r = re.compile('Referer: (.*)') 73 | referer = r.search(fileHandle.readline()) 74 | if referer is not None: 75 | referer = referer.group(1) 76 | #print referer.group(1) 77 | 78 | 79 | # CORRUPT_PE 80 | corrupt_pe = False 81 | r = re.compile('CORRUPT_(PE|FILE)') 82 | corrupt_pe_str = r.search(fileHandle.readline()) 83 | if corrupt_pe_str is not None: 84 | corrupt_pe = True 85 | 86 | 87 | # Now, parse data from the response 88 | # Server 89 | data = fileHandle.read() 90 | r = re.compile('Server: (.*)') 91 | server = r.search(data) 92 | if server is not None: 93 | server = server.group(1) 94 | server = server.rstrip('\r') 95 | server = server[:64] 96 | 97 | # Content-Type 98 | r = re.compile('Content-Type: (.*)') 99 | cont_type = r.search(data) 100 | if cont_type is not None: 101 | cont_type = cont_type.group(1) 102 | cont_type = cont_type.rstrip('\r') 103 | cont_type = cont_type[:128] 104 | 105 | #print "Time after http parsing: %f" %(time.time(),) 106 | # Database statement 107 | cursor.execute(""" 108 | INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host, 109 | referer,server_application,content_type,dst_port,corrupt,file_size) 110 | VALUES 111 | 
(%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", 112 | (sha1, md5, timestamp, srcip, dstip, method, url, host, referer, server, 113 | cont_type, dst_port, corrupt_pe, file_size)) 114 | cursor.execute(""" 115 | SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC 116 | """, (sha1,)) 117 | dump_id = cursor.fetchone()[0] 118 | print ("A new entry on host:%s has been made in pe_dumps table with " 119 | "dump_id %s" % (host, dump_id)) 120 | 121 | fileHandle.close() 122 | cursor.close() 123 | conn.close() 124 | return dump_id, corrupt_pe 125 | 126 | 127 | if __name__ == "__main__": 128 | file_path = sys.argv[1] 129 | sha1 = sys.argv[2] 130 | md5 = sys.argv[3] 131 | file_size = sys.argv[4] 132 | db_pe_dumps(file_path, sha1, md5, file_size) 133 | -------------------------------------------------------------------------------- /file_dump/seq_list.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Roberto Perdisci (perdisci@cs.uga.edu) 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | 18 | #include "seq_list.h" 19 | 20 | seq_list_t *seq_list_init(void) { 21 | 22 | seq_list_t *l = (seq_list_t*)malloc(sizeof(seq_list_t)); 23 | if(l == NULL) { 24 | printf("Failed to initialize seq_list! 
Out of memory???\n"); 25 | fflush(stdout); 26 | exit(1); 27 | } 28 | 29 | memset(l,0,sizeof(seq_list_t)); 30 | l->head = NULL; 31 | l->tail = NULL; 32 | l->next = NULL; 33 | 34 | return l; 35 | } 36 | 37 | void seq_list_destroy(seq_list_t* l, int mz_found) { 38 | 39 | /* DEBUG 40 | if(mz_found) { 41 | printf("Calling seq_list_destroy!!!\n"); 42 | fflush(stdout); 43 | } 44 | */ 45 | 46 | 47 | if(l == NULL) 48 | return; 49 | 50 | seq_list_entry_t *h = l->head; 51 | seq_list_entry_t *n; 52 | 53 | while(h != NULL) { 54 | n = h->next; 55 | free(h); 56 | h = n; 57 | } 58 | 59 | l->head = NULL; 60 | l->tail = NULL; 61 | l->next = NULL; 62 | 63 | free(l); 64 | 65 | 66 | /* DEBUG 67 | if(mz_found) { 68 | printf("Destroyed seq_list!!!\n"); 69 | fflush(stdout); 70 | } 71 | */ 72 | } 73 | 74 | void seq_list_insert(seq_list_t *l, u_int sn, u_int ps) { 75 | 76 | if(l == NULL) 77 | return; 78 | 79 | seq_list_entry_t *e = (seq_list_entry_t*)malloc(sizeof(seq_list_entry_t)); 80 | if(e == NULL) { 81 | printf("Error allocating memory for inserting element in seq_list; Out of memory???\n"); 82 | fflush(stdout); 83 | exit(1); 84 | } 85 | 86 | // initialize the new element 87 | memset(e,0,sizeof(seq_list_entry_t)); 88 | e->sn = sn; 89 | e->ps = ps; 90 | e->next = NULL; 91 | 92 | if(l->head == NULL) { 93 | l->head = e; 94 | l->tail = e; 95 | l->next = e; 96 | 97 | return; 98 | } 99 | 100 | if(l->tail == NULL) { 101 | printf("Error: list tail cannot be null here!\n"); 102 | fflush(stdout); 103 | exit(1); 104 | } 105 | l->tail->next = e; 106 | l->tail = e; 107 | 108 | } 109 | 110 | seq_list_entry_t *seq_list_head(seq_list_t *l) { 111 | 112 | if(l == NULL) 113 | return NULL; 114 | 115 | return l->head; 116 | } 117 | 118 | seq_list_entry_t *seq_list_tail(seq_list_t *l) { 119 | 120 | if(l == NULL) 121 | return NULL; 122 | 123 | return l->tail; 124 | } 125 | 126 | seq_list_entry_t *seq_list_next(seq_list_t *l) { 127 | 128 | if(l == NULL) 129 | return NULL; 130 | 131 | if(l->next == NULL) 132 | return NULL; 133 | 134 | seq_list_entry_t *n = l->next; 135 | l->next = l->next->next; 136 | 137 | return n; 138 | 139 | } 140 | 141 | void seq_list_restart_from_head(seq_list_t *l) { 142 | 143 | if(l == NULL) 144 | return; 145 | 146 | l->next = l->head; 147 | 148 | } 149 | 150 | void seq_list_restart_from_element(seq_list_t *l, seq_list_entry_t *e) { 151 | 152 | if(l == NULL) 153 | return; 154 | 155 | l->next = e; 156 | 157 | } 158 | 159 | u_int seq_list_get_seq_num(seq_list_entry_t *e) { 160 | 161 | if(e == NULL) 162 | return 0; 163 | return e->sn; 164 | } 165 | 166 | 167 | u_int seq_list_get_payload_size(seq_list_entry_t *e) { 168 | 169 | if(e == NULL) 170 | return 0; 171 | return e->ps; 172 | } 173 | 174 | 175 | void seq_list_print(seq_list_t *l) { 176 | 177 | if(l == NULL) 178 | return; 179 | 180 | seq_list_entry_t *e = l->head; 181 | while(e != NULL) { 182 | printf("(%u,%u) ", e->sn, e->ps); 183 | e = e->next; 184 | } 185 | printf("\n"); 186 | 187 | } 188 | 189 | /* For debugging purposes */ 190 | /** 191 | int main(void) { 192 | 193 | seq_list_t *l = seq_list_init(); 194 | 195 | seq_list_insert(l,1,10); 196 | seq_list_insert(l,5,8); 197 | seq_list_insert(l,11,100); 198 | seq_list_insert(l,45,190); 199 | 200 | seq_list_print(l); 201 | 202 | seq_list_destroy(l, 0); 203 | 204 | return 0; 205 | 206 | } 207 | **/ 208 | -------------------------------------------------------------------------------- /amico_scripts/manual_download.py: -------------------------------------------------------------------------------- 1 | 
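# manual_download.py re-downloads a previously captured file from its
# original URL with a plain urllib2 client and records whether the
# re-downloaded payload's sha1/md5 match those of the captured flow.
# Standalone usage (illustrative): python manual_download.py <captured_sha1>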
########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu, Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # perdisci@cs.uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | 16 | import sys 17 | import re 18 | import time 19 | import hashlib 20 | from struct import unpack 21 | from config import capture_file_types 22 | from extract_file import extract_file_type 23 | 24 | import urllib2 25 | 26 | import util 27 | from config import MAN_DOWNLOAD_DIR 28 | 29 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)" 30 | HTTP_TIMEOUT = 40 # HTTP Request timeout 31 | 32 | 33 | # Take the request, download the file and generate sha1 and md5 hashes 34 | # When the file is a valid pe and different from previous, then, save 35 | # it to the downloads directory 36 | def download_file(dump_id, req, captured_sha1): 37 | # Make the request 38 | try: 39 | res = urllib2.urlopen(req, timeout=HTTP_TIMEOUT).read() 40 | except urllib2.URLError, e: 41 | res = None 42 | print "Error making the manual download", e 43 | 44 | sha1 = None 45 | md5 = None 46 | is_interesting_file = None 47 | 48 | if res is None: 49 | print "Executable could not be downloaded manually" 50 | else: 51 | file_type = extract_file_type(res) 52 | if file_type in capture_file_types: 53 | print "Manually downloaded", file_type, "file" 54 | sha1 = hashlib.sha1(res).hexdigest() 55 | 56 | # Store the downloaded file in a sub directory as md5.exe 57 | md5 = hashlib.md5(res).hexdigest() 58 | 59 | download_file = open(MAN_DOWNLOAD_DIR + "/" + md5 + "." + file_type, "w") 60 | download_file.write(res) 61 | download_file.close() 62 | print "Written " + MAN_DOWNLOAD_DIR + "/" + md5 + "." + file_type 63 | is_interesting_file = True 64 | else: 65 | print "Manually downloaded an uninteresting file!" 
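# Whether or not the payload was of interest, the check below compares
# the sha1 of the re-downloaded content against the sha1 of the originally
# captured flow; a mismatch suggests the server served us different content.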
66 | is_interesting_file = False 67 | 68 | if captured_sha1 != sha1: 69 | different = True 70 | print "Checksums did not match for dump_id: ", dump_id 71 | print captured_sha1, "!=", sha1 72 | else: 73 | different = False 74 | 75 | return sha1, md5, different, is_interesting_file 76 | 77 | 78 | def manual_download(captured_sha1): 79 | util.setup_socks() 80 | conn = util.connect_to_db() 81 | cursor = conn.cursor() 82 | 83 | # Database query to get the relevant recent record 84 | cursor.execute(""" 85 | SELECT dump_id,host,url,referer,client,server FROM pe_dumps WHERE sha1 = %s 86 | ORDER BY timestamp DESC;""", (captured_sha1,)) 87 | row = cursor.fetchone() 88 | dump_id = row[0] 89 | host = row[1] 90 | url = row[2] 91 | referer = row[3] 92 | client = row[4] 93 | server = row[5] 94 | 95 | full_url = "http://" 96 | ordered_host = server # if host is null, we use the server IP 97 | if host: 98 | ordered_host = util.reorder_domain(host) 99 | full_url += ordered_host 100 | if url: 101 | full_url += url 102 | print "Starting manual download from:", full_url 103 | 104 | # Prepare the urllib2 request 105 | req = urllib2.Request(full_url) 106 | req.add_header("User-Agent", USER_AGENT) 107 | 108 | download_time = time.time() 109 | sha1, md5, different, is_interesting_file = download_file(dump_id, req, captured_sha1) 110 | 111 | # Database statement 112 | cursor.execute(""" 113 | INSERT INTO manual_download_checksums(dump_id, sha1, 114 | md5, different, referer_exists, timestamp, is_pe) 115 | VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""", 116 | (dump_id, sha1, md5, different, False, download_time, is_interesting_file)) 117 | 118 | cursor.close() 119 | conn.close() 120 | 121 | if __name__ == "__main__": 122 | manual_download(sys.argv[1]) 123 | -------------------------------------------------------------------------------- /amico_scripts/db_file_dumps.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2014 Phani Vadrevu, Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. 
# 12 | # # 13 | ########################################################################### 14 | 15 | import re 16 | import sys 17 | from config import * 18 | 19 | import util 20 | 21 | def db_file_dumps(file_path, sha1, md5, file_size, file_type): 22 | #print "Time b4 http parsing: %f" %(time.time(),) 23 | # Use Autocommit mode for database connection 24 | conn = util.connect_to_db() 25 | cursor = conn.cursor() 26 | 27 | fileHandle = open(file_path) 28 | 29 | # Timestamp 30 | r = re.compile('[0-9]+') 31 | timestamp = r.search(fileHandle.readline()) 32 | if timestamp is not None: 33 | timestamp = timestamp.group() 34 | #print timestamp.group() 35 | 36 | # Source and Destination IPs 37 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 38 | ip = r.search(fileHandle.readline()) 39 | if ip is not None: 40 | srcip = ip.group(2) 41 | dstip = ip.group(1) 42 | dst_port = ip.group(3) 43 | #print ip.group(1) 44 | #print ip.group(2) 45 | else: 46 | srcip = None 47 | dstip = None 48 | dst_port = None 49 | 50 | # URL 51 | r = re.compile('(GET|POST|HEAD) (.*)') 52 | url = r.search(fileHandle.readline()) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | toks = url.split() 58 | url = toks[0] 59 | #print url.group(1) 60 | else: 61 | method = None 62 | 63 | 64 | # Host 65 | r = re.compile('Host: (.*)') 66 | host = r.search(fileHandle.readline()) 67 | if host is not None: 68 | host = host.group(1) 69 | host = util.reorder_domain(host.strip()) 70 | #print host.group(1) 71 | 72 | 73 | # Referer 74 | r = re.compile('Referer: (.*)') 75 | referer = r.search(fileHandle.readline()) 76 | if referer is not None: 77 | referer = referer.group(1) 78 | #print referrer.group(1) 79 | 80 | 81 | # CORRUPT_PE 82 | corrupt_pe = False 83 | r = re.compile('CORRUPT_FILE') 84 | corrupt_pe_str = r.search(fileHandle.readline()) 85 | if corrupt_pe_str is not None: 86 | corrupt_pe = True 87 | 88 | 89 | # Now, parse data from the response 90 | # Server 91 | data = fileHandle.read() 92 | r = re.compile('Server: (.*)') 93 | server = r.search(data) 94 | if server is not None: 95 | server = server.group(1) 96 | server = server.rstrip('\r') 97 | server = server[:64] 98 | 99 | # Content-Type 100 | r = re.compile('Content-Type: (.*)') 101 | cont_type = r.search(data) 102 | if cont_type is not None: 103 | cont_type = cont_type.group(1) 104 | cont_type = cont_type.rstrip('\r') 105 | cont_type = cont_type[:128] 106 | 107 | #print "Time after http parsing: %f" %(time.time(),) 108 | # Database statement 109 | cursor.execute(""" 110 | INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host, 111 | referer,server_application,content_type,dst_port,corrupt,file_size,file_type) 112 | VALUES 113 | (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", 114 | (sha1, md5, timestamp, srcip, dstip, method, url, host, referer, server, 115 | cont_type, dst_port, corrupt_pe, file_size, file_type)) 116 | cursor.execute(""" 117 | SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC LIMIT 1 118 | """, (sha1,)) 119 | dump_id = cursor.fetchone()[0] 120 | print ("A new entry on host:%s has been made in pe_dumps table with " 121 | "dump_id %s" % (host, dump_id)) 122 | 123 | fileHandle.close() 124 | cursor.close() 125 | conn.close() 126 | return dump_id, corrupt_pe, host, dstip, srcip 127 | 128 | 129 | if __name__ == "__main__": 130 | file_path = sys.argv[1] 131 | sha1 = sys.argv[2] 132 | md5 = sys.argv[3] 133 | file_size = sys.argv[4] 134 | file_type = sys.argv[5] 135 | 
db_file_dumps(file_path, sha1, md5, file_size, file_type) 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /amico_scripts/ip2asn.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | import psycopg2 17 | import socks 18 | import socket 19 | import time 20 | import subprocess 21 | 22 | import util 23 | from config import * 24 | 25 | 26 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)" 27 | CYMRU_TIMEOUT = 1 # Timeout for cymru dig call 28 | 29 | 30 | def ip2asn(dump_id): 31 | # Connect to database 32 | try: 33 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 34 | % (db_name, db_host, db_user, db_password)) 35 | except: 36 | sys.exit("Unable to connect to database: " + db_name) 37 | 38 | # Use Autocommit mode for database connection 39 | conn.set_isolation_level(0) 40 | cursor = conn.cursor() 41 | 42 | # Setup SOCKS proxy 43 | if socks_proxy_host: 44 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, 45 | socks_proxy_host, socks_proxy_port) 46 | socket.socket = socks.socksocket 47 | ### 48 | 49 | # Database query to get the relevant recent record 50 | cursor.execute(""" 51 | SELECT server, timestamp FROM pe_dumps WHERE dump_id = %s 52 | """, (dump_id,)) 53 | row = cursor.fetchone() 54 | server_ip = row[0] 55 | # Exit if an AS containing this IP has been logged within the last month 56 | cursor.execute(""" 57 | SELECT * FROM bgp2asn WHERE log_date > (current_date - interval '1 month') 58 | AND bgp_prefix >> %s """, (server_ip,)) 59 | if cursor.rowcount > 0: 60 | return 61 | 62 | # Query whois.cymru.com 63 | #cmd = subprocess.Popen(['whois','-h','whois.cymru.com','-v', 64 | # server_ip], stdout = subprocess.PIPE) 65 | #as_info = cmd.stdout 66 | #for line in as_info: 67 | # if(server_ip in line): 68 | # output = line.split('|') 69 | # break 70 | #words=[] 71 | #for word in output: 72 | # words.append(word.strip()) 73 | 74 | # Query asn.cymru.com using dig 75 | # A sample output is: 76 | # "701 1239 3549 3561 7132 | 216.90.108.0/24 | US | arin | 1998-09-25" 77 | print "Querying Team Cymru for AS info of", server_ip 78 | cmd = subprocess.Popen(['dig', '+short', util.reverse_ip(server_ip) + 79 | '.origin.asn.cymru.com', 'TXT'], stdout=subprocess.PIPE) 80 | time.sleep(CYMRU_TIMEOUT) 81 | if cmd.poll() is None: 82 | cmd.kill() 83 | return 84 | as_info = cmd.stdout.readline() 85 | as_info = as_info.strip().strip('"') 86 | output = as_info.split('|') 87 | words = [] 88 | for answer in output: 89 | if answer: 90 | words.append(answer.split()[0].strip()) 91 | else: 92 | words.append(None) 93 | 94 | #print words 95 | as_number = words[0] 96 | bgp_prefix = words[1] 97 | country_code = words[2] 98 | date_allocated = words[4] 99 | 100 | # Sample output: 101 | # "23028 | US | arin | 2002-01-04 | TEAMCYMRU - SAUNET" 102 | cmd = subprocess.Popen(['dig', '+short', 'AS' + as_number + 
'.asn.cymru.com', 103 | 'TXT'], stdout=subprocess.PIPE) 104 | time.sleep(CYMRU_TIMEOUT) 105 | if cmd.poll() is None: 106 | cmd.kill() 107 | print ("ip2asn.py: Couldn't finish the call to cymru for {0}. Aborting..." 108 | .format((server_ip,))) 109 | return 110 | as_info = cmd.stdout.readline() 111 | as_info = as_info.strip().strip('"') 112 | output = as_info.split('|') 113 | words = [] 114 | for word in output: 115 | words.append(word.strip()) 116 | print words 117 | as_name = words[4] 118 | 119 | # Store the record in the database 120 | cursor.execute(""" 121 | INSERT INTO 122 | bgp2asn 123 | (bgp_prefix, as_number, as_name, country_code, 124 | date_allocated, log_date) 125 | VALUES (%s,%s,%s,%s,%s,current_date)""" 126 | , (bgp_prefix, as_number, as_name, country_code, 127 | date_allocated)) 128 | 129 | cursor.close() 130 | conn.close() 131 | 132 | if __name__ == "__main__": 133 | ip2asn(sys.argv[1]) 134 | -------------------------------------------------------------------------------- /amico_scripts/db_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2011 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | 16 | import psycopg2 17 | import os 18 | import config 19 | import re 20 | 21 | from config import * 22 | from fe_db_setup import fe_db_setup 23 | 24 | # Reorder the subdomains in the host name such that 25 | # the TLD comes first. Eg: com.google.www 26 | def reorder_domain(host): 27 | host = host.split(':')[0] # in case host string contains port 28 | 29 | ordered_host = "" 30 | host += '.' 
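# Worked example for the loop below: "www.google.com" becomes
# "www.google.com.", re.findall('.*?\.') yields ['www.', 'google.', 'com.'],
# and concatenating the pieces in reverse (dropping the trailing dot)
# produces "com.google.www".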
31 | domains = re.findall('.*?\.',host) 32 | for i in range(len(domains)): 33 | ordered_host += domains[len(domains)-i-1] 34 | ordered_host = ordered_host[:-1] 35 | return ordered_host 36 | 37 | # Connect to database 38 | try: 39 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 40 | %(db_name,db_host,db_user,db_password)) 41 | except: 42 | raise SystemExit("Unable to connect to database: "+db_name) 43 | 44 | conn.set_isolation_level(0) 45 | cursor = conn.cursor() 46 | 47 | try: 48 | cursor.execute(""" 49 | CREATE TABLE pe_dumps( dump_id SERIAL,PRIMARY KEY(dump_id), 50 | sha1 VARCHAR(40),md5 VARCHAR(32),timestamp TIMESTAMP, server INET, 51 | client INET,method VARCHAR(10),url VARCHAR(512),host VARCHAR(256), 52 | referer VARCHAR(512),server_application VARCHAR(64), 53 | content_type VARCHAR(128),dst_port INT,corrupt BOOLEAN, 54 | file_size INT,file_type VARCHAR(5)) 55 | """) 56 | except psycopg2.DatabaseError as e: 57 | print e 58 | try: 59 | cursor.execute("CREATE INDEX pd_sha1_index ON pe_dumps(sha1)") 60 | except psycopg2.DatabaseError as e: 61 | print e 62 | try: 63 | cursor.execute("CREATE INDEX pd_md5_index ON pe_dumps(md5)") 64 | except psycopg2.DatabaseError as e: 65 | print e 66 | try: 67 | cursor.execute("CREATE INDEX pd_host_index ON pe_dumps(host)") 68 | except psycopg2.DatabaseError as e: 69 | print e 70 | try: 71 | cursor.execute(""" 72 | CREATE TABLE virus_total_scans(vt_id SERIAL,PRIMARY KEY(vt_id), 73 | sha1 VARCHAR(40),md5 VARCHAR(32),json TEXT,num_av_labels INT, 74 | trusted_av_labels INT,scan_time TIMESTAMP,query_time TIMESTAMP, 75 | first_seen TIMESTAMP) 76 | """) 77 | except psycopg2.DatabaseError as e: 78 | print e 79 | 80 | try: 81 | cursor.execute(""" 82 | CREATE TABLE virus_total_submissions( 83 | vt_submit_id SERIAL, 84 | PRIMARY KEY(vt_submit_id), 85 | submit_time TIMESTAMP, 86 | sha1 VARCHAR(40), 87 | md5 VARCHAR(32), 88 | json TEXT, 89 | num_av_labels INT, 90 | trusted_av_labels INT, 91 | scan_time TIMESTAMP, 92 | scan_id VARCHAR(75), 93 | resubmit_id INT REFERENCES virus_total_submissions(vt_submit_id)) 94 | """) 95 | except psycopg2.DatabaseError as e: 96 | print e 97 | try: 98 | cursor.execute(""" 99 | CREATE TABLE ped_vts_mapping (dump_id INT REFERENCES pe_dumps(dump_id), 100 | vt_id INT REFERENCES virus_total_scans(vt_id)) 101 | """) 102 | except psycopg2.DatabaseError as e: 103 | print e 104 | 105 | try: 106 | cursor.execute("CREATE INDEX vt_sha1_index ON virus_total_scans(sha1)") 107 | except psycopg2.DatabaseError as e: 108 | print e 109 | try: 110 | cursor.execute("CREATE INDEX vt_md5_index ON virus_total_scans(md5)") 111 | except psycopg2.DatabaseError as e: 112 | print e 113 | 114 | try: 115 | cursor.execute(""" 116 | CREATE TABLE manual_download_checksums(dump_id INT REFERENCES pe_dumps(dump_id), 117 | sha1 VARCHAR(40), md5 VARCHAR(32), different BOOLEAN, referer_exists BOOLEAN, 118 | timestamp TIMESTAMP, is_pe BOOLEAN); 119 | """) 120 | except psycopg2.DatabaseError as e: 121 | print e 122 | 123 | try: 124 | cursor.execute(""" 125 | CREATE TABLE bgp2asn(bgp_prefix INET, as_number INT, as_name VARCHAR(512), 126 | country_code VARCHAR(2), date_allocated DATE, log_date DATE) 127 | """) 128 | except psycopg2.DatabaseError as e: 129 | print e 130 | 131 | try: 132 | cursor.execute(""" 133 | CREATE TABLE amico_scores( 134 | dump_id INT PRIMARY KEY REFERENCES pe_dumps(dump_id), 135 | score REAL) 136 | """) 137 | except psycopg2.DatabaseError as e: 138 | print e 139 | 140 | print("""Created tables: pe_dumps, virus_total_scans, virus_total_submissions, ped_vts_mapping, manual_download_checksums, 
141 | bgp2asn, amico_scores""") 142 | 143 | fe_db_setup() 144 | cursor.close() 145 | conn.close() 146 | -------------------------------------------------------------------------------- /amico_scripts/db_virus_total.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu and Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from datetime import datetime, timedelta, MINYEAR 17 | import time 18 | 19 | import simplejson 20 | import logging 21 | import logging.config 22 | 23 | import util 24 | import vt_api 25 | from config import trusted_av_vendors 26 | 27 | LOG_CONF_FILE = "logging.conf" 28 | # Do not make a new query for the same sha1 if the previous query was made 29 | # with in VT_QUERY_INTERVAL (in days) 30 | VT_QUERY_INTERVAL = 1 31 | MAX_TRIES = 3 32 | 33 | 34 | def insert_report(cursor, report, sha1, md5, json, dump_id): 35 | scan_time = report["scan_date"] 36 | scans = report["scans"] 37 | num_av_labels = report["positives"] 38 | trusted_av_labels = 0 39 | for k, v in scans.iteritems(): 40 | if v["detected"] is True: 41 | if k in trusted_av_vendors: 42 | trusted_av_labels += 1 43 | scan_time += " UTC" 44 | cursor.execute(""" 45 | INSERT INTO virus_total_scans(sha1,md5,json,num_av_labels, 46 | trusted_av_labels,scan_time,query_time) 47 | VALUES (%s,%s,%s,%s,%s,TIMESTAMP WITH TIME ZONE %s, 48 | CLOCK_TIMESTAMP()) 49 | RETURNING vt_id 50 | """, (sha1, md5, json, num_av_labels, 51 | trusted_av_labels, scan_time)) 52 | vt_id = cursor.fetchone()[0] 53 | 54 | cursor.execute(""" 55 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 56 | VALUES (%s, %s)""", 57 | (dump_id, vt_id)) 58 | print "Virus Total: Scan report found. Entry has been made into" 59 | print "virus_total_scans table" 60 | 61 | 62 | def db_virus_total(dump_id): 63 | logging.config.fileConfig(LOG_CONF_FILE) 64 | logger = logging.getLogger("amico_logger") 65 | util.setup_socks() 66 | conn = util.connect_to_db() 67 | cursor = conn.cursor() 68 | 69 | # Exit if this sha1 has been queried in the past VT_QUERY_INTERVAL period 70 | prev_query_time = datetime(MINYEAR, 1, 1, 0, 0, 0, 0) 71 | time_now = datetime.now() 72 | cursor.execute(""" 73 | SELECT sha1, md5 74 | FROM pe_dumps 75 | WHERE dump_id = %s""", 76 | (dump_id,)) 77 | (sha1, md5) = cursor.fetchone() 78 | 79 | try: 80 | cursor.execute("SELECT query_time, vt_id FROM virus_total_scans " 81 | "WHERE sha1 = %s " 82 | "ORDER by query_time DESC", (sha1,)) 83 | res = cursor.fetchone() 84 | if res: 85 | prev_query_time = res[0] 86 | vt_id = res[1] 87 | except: 88 | print "sha1:%s no previous VT query" % (sha1, ) 89 | pass 90 | 91 | vt_query_period = timedelta(days=VT_QUERY_INTERVAL) 92 | if (time_now - prev_query_time) < vt_query_period: 93 | print "sha1:%s has been queried recently. Skipping..." 
% (sha1, ) 94 | cursor.execute(""" 95 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 96 | VALUES (%s, %s)""", 97 | (dump_id, vt_id)) 98 | conn.close() 99 | return 100 | 101 | tries = 0 102 | success = False 103 | while tries < MAX_TRIES: 104 | try: 105 | tries += 1 106 | json = vt_api.get_vt_report(md5) 107 | if not json: 108 | continue 109 | report = simplejson.loads(json) 110 | if report["response_code"] == 1: 111 | insert_report(cursor, report, sha1, md5, json, dump_id) 112 | success = True 113 | break 114 | elif report["response_code"] == 0: 115 | cursor.execute(""" 116 | INSERT INTO virus_total_scans(sha1, md5, query_time) 117 | VALUES (%s, %s, CLOCK_TIMESTAMP()) 118 | RETURNING vt_id 119 | """, (sha1, md5)) 120 | vt_id = cursor.fetchone()[0] 121 | cursor.execute(""" 122 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 123 | VALUES (%s, %s)""", 124 | (dump_id, vt_id)) 125 | print "Virus Total: No scan report exists in the VT database" 126 | success = True 127 | break 128 | else: 129 | logger.exception("Unknown response code! %s" % 130 | (report["response_code"],)) 131 | time.sleep(1) 132 | 133 | except Exception as e: 134 | print e 135 | logger.exception("Try %s. Error in fetching report for md5 %s: %s" 136 | % (tries, md5, e)) 137 | time.sleep(5) 138 | if not success: 139 | cursor.execute(""" 140 | INSERT INTO ped_vts_mapping (dump_id) 141 | VALUES (%s)""", 142 | (dump_id,)) 143 | logger.warning("Giving up on dump_id: %s's VT report" % (dump_id,)) 144 | cursor.close() 145 | conn.close() 146 | 147 | if __name__ == "__main__": 148 | db_virus_total(sys.argv[1]) 149 | -------------------------------------------------------------------------------- /amico_scripts/extract_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Roberto Perdisci # 5 | # perdisci@cs.uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import sys, os 18 | import re 19 | from struct import unpack 20 | from config import capture_file_types 21 | 22 | def prune_http_resp_headers(data): 23 | # finds start of resp header 24 | m = re.search("HTTP/\d\.\d\s\d\d\d", data) 25 | if m: 26 | pos = m.start() 27 | data = data[pos:] 28 | 29 | # now we can search for the end of the response header 30 | m = re.search('\r\n\r\n',data) 31 | if m: 32 | pos = m.start() 33 | return data[pos+4:] # returns all data after \r\n\r\n 34 | 35 | 36 | def is_pe_file(bin_data): 37 | if bin_data[0:2] == 'MZ': 38 | offset = unpack('i', bin_data[0x3c:0x3c+4])[0] 39 | if bin_data[offset:offset+2] == 'PE': 40 | # print "This is a PE file!" 41 | return True 42 | 43 | # print "This is NOT a PE file!" 44 | return False 45 | 46 | 47 | def is_jar_file(bin_data): 48 | if bin_data[0:4].encode('hex').upper() == '504B0304': 49 | # print "Searching for manifest.mf" 50 | regex = re.compile('MANIFEST.MF',re.IGNORECASE) 51 | m = regex.search(bin_data) 52 | if m: 53 | # print "Found manifest!" 
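# (A JAR that also bundles AndroidManifest.xml is really an APK, which is
# why extract_file_type() below re-tests JAR matches with is_apk_file().)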
54 | return True 55 | 56 | 57 | def is_apk_file(bin_data): 58 | if bin_data[0:4].encode('hex').upper() == '504B0304': 59 | # print "Searching for AndroidManifest.xml" 60 | regex = re.compile('AndroidManifest.xml',re.IGNORECASE) 61 | m = regex.search(bin_data) 62 | if m: 63 | # print "Found Android Manifest!" 64 | return True 65 | 66 | 67 | def is_elf_file(bin_data): 68 | if bin_data[0].encode('hex').upper() == '7F': 69 | if bin_data[1:4] == 'ELF': 70 | return True 71 | return False 72 | 73 | 74 | def is_pdf_file(bin_data): 75 | if bin_data[0:4] == '%PDF': 76 | return True 77 | return False 78 | 79 | 80 | def is_rar_file(bin_data): 81 | if bin_data[0:4] == 'Rar!': 82 | return True 83 | return False 84 | 85 | 86 | def is_zip_file(bin_data): 87 | if bin_data[0:4].encode('hex').upper() == '504B0304': 88 | return True 89 | return False 90 | 91 | 92 | def is_swf_file(bin_data): 93 | magicstr = bin_data[0:3].encode('hex') 94 | if magicstr == '465753' or magicstr == '435753' or magicstr == '5A5753': 95 | return True 96 | return False 97 | 98 | 99 | def is_msdoc_file(bin_data): 100 | # msdocx_magic[] = {0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00}; 101 | # msdoc_magic[] = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}; 102 | magicstr = bin_data[0:8].encode('hex') 103 | if magicstr == '504B030414000600': 104 | return True 105 | if magicstr == 'D0CF11E0A1B11AE1': 106 | return True 107 | return False 108 | 109 | 110 | def is_dmg_file(bin_data): 111 | 112 | magicstr = bin_data[0:1].encode('hex') 113 | if magicstr == '78' or bin_data[0:3] == 'BZh': 114 | regex = re.compile('koly',re.IGNORECASE) 115 | m = regex.search(bin_data) 116 | if m: 117 | # print "Found koly!" 118 | return True 119 | return False 120 | 121 | 122 | def extract_file_type(data): 123 | 124 | file_type = None 125 | 126 | if not file_type and is_pe_file(data): 127 | file_type = "EXE" 128 | 129 | if not file_type and is_jar_file(data): 130 | file_type = "JAR" 131 | 132 | if (not file_type or file_type=="JAR") and is_apk_file(data): 133 | file_type = "APK" 134 | 135 | if not file_type and is_elf_file(data): 136 | file_type = "ELF" 137 | 138 | if not file_type and is_dmg_file(data): 139 | file_type = "DMG" 140 | 141 | if not file_type and is_msdoc_file(data): 142 | file_type = "MSDOC" 143 | 144 | if not file_type and is_rar_file(data): 145 | file_type = "RAR" 146 | 147 | if not file_type and is_swf_file(data): 148 | file_type = "SWF" 149 | 150 | if not file_type and is_pdf_file(data): 151 | file_type = "PDF" 152 | 153 | if not file_type and is_zip_file(data): 154 | # notice that this is more generic than other 155 | # derived file formats (e.g., JAR, DOCX, etc.) 156 | # and therefore this check should run last! 
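# ("PK\x03\x04", i.e. 50 4B 03 04, is also the magic number of JAR, APK
# and OOXML documents, so the plain-ZIP test must come after those checks.)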
157 | file_type = "ZIP" 158 | 159 | return file_type 160 | 161 | 162 | 163 | def extract_file(flow_file, dst=None): 164 | 165 | if not dst: 166 | dst = flow_file 167 | 168 | f = open(flow_file, 'rb') 169 | data = f.read() 170 | f.close() 171 | 172 | data = prune_http_resp_headers(data) 173 | 174 | file_type = None 175 | file_extension = '' 176 | 177 | if not file_type and is_pe_file(data): 178 | file_type = "EXE" 179 | file_extension = "exe" 180 | 181 | if not file_type and is_jar_file(data): 182 | file_type = "JAR" 183 | file_extension = "jar" 184 | 185 | if (not file_type or file_type=="JAR") and is_apk_file(data): 186 | file_type = "APK" 187 | file_extension = "apk" 188 | 189 | if not file_type and is_elf_file(data): 190 | file_type = "ELF" 191 | file_extension = "elf" 192 | 193 | if not file_type and is_dmg_file(data): 194 | file_type = "DMG" 195 | file_extension = "dmg" 196 | 197 | if not file_type and is_msdoc_file(data): 198 | file_type = "MSDOC" 199 | file_extension = "msdoc" # generic for DOC(X), PPT(X), XLS(X), etc. 200 | 201 | if not file_type and is_rar_file(data): 202 | file_type = "RAR" 203 | file_extension = "rar" 204 | 205 | if not file_type and is_swf_file(data): 206 | file_type = "SWF" 207 | file_extension = "swf" 208 | 209 | if not file_type and is_pdf_file(data): 210 | file_type = "PDF" 211 | file_extension = "pdf" 212 | 213 | if not file_type and is_zip_file(data): 214 | # notice that this is more generic than other 215 | # derived file formats (e.g., JAR, DOCX, etc.) 216 | # and therefore this check should run last! 217 | file_type = "ZIP" 218 | file_extension = "zip" 219 | 220 | if file_type in capture_file_types: 221 | dst = dst+'.'+file_extension 222 | print "Writing file:", dst 223 | f = open(dst, 'wb') 224 | f.write(data) 225 | f.close() 226 | print "Finished!" 227 | return (file_type, dst, file_extension) 228 | 229 | return(None, None, None) 230 | 231 | 232 | 233 | if __name__ == '__main__': 234 | extract_file(sys.argv[1]) 235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /amico_scripts/start_amico.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | ########################################################################### 4 | # Copyright (C) 2014 Phani Vadrevu and Roberto Perdisci # 5 | # pvadrevu@uga.edu # 6 | # perdisci@uga.edu # 7 | # # 8 | # Distributed under the GNU Public License # 9 | # http://www.gnu.org/licenses/gpl.txt # 10 | # # 11 | # This program is free software; you can redistribute it and/or modify # 12 | # it under the terms of the GNU General Public License as published by # 13 | # the Free Software Foundation; either version 2 of the License, or # 14 | # (at your option) any later version. 
# 15 | # # 16 | ########################################################################### 17 | from multiprocessing import Process 18 | import shutil 19 | import os 20 | import subprocess 21 | import hashlib 22 | import time 23 | import traceback 24 | from cachetools import TTLCache 25 | 26 | from config import whitelist_domains, vt_submissions as vts_config 27 | from vt_submit import vt_submissions_func 28 | # from pe_extract import pe_extract 29 | from extract_file import extract_file 30 | from db_file_dumps import db_file_dumps 31 | from db_virus_total import db_virus_total 32 | from manual_download import manual_download 33 | from ip2asn import ip2asn 34 | from get_feature_vector import get_feature_vector 35 | from classify_dump import classify_dump,update_score 36 | from db_syslog import db_syslog 37 | 38 | WAIT_TIME = 1 39 | DUMP_DIR = "../file_dump/dumps" 40 | RAW_DIR = "parsed/raw_files/" 41 | FILES_DIR = "parsed/captured_files/" 42 | MD_TIMEOUT = 180 43 | VT_TIMEOUT = 60 44 | 45 | md5host_cache = TTLCache(100000,ttl=60*10) 46 | MAX_MD5_CACHE_COUNT = 1 47 | 48 | hostcs_cache = TTLCache(100000,ttl=60*10) 49 | MAX_HOSTCS_CACHE_COUNT = 3 50 | 51 | # Makes a function call in a separate process 52 | # and makes sure it times out after 'timeout' seconds 53 | def process_timeout(func, func_args, timeout): 54 | p = Process(target=func, args=(func_args,)) 55 | p.start() 56 | p.join(timeout) 57 | p.terminate() 58 | 59 | 60 | def is_whitelisted(file_name): 61 | with open(file_name) as f: 62 | for _ in xrange(6): 63 | line = f.readline() 64 | if line.startswith("% Host:"): 65 | tok = line.split(':') 66 | if len(tok)>1: 67 | host = tok[1].strip() 68 | for domain in whitelist_domains: 69 | if host == domain or host.endswith('.'+domain): 70 | return True 71 | return False 72 | 73 | 74 | def get_file_hashes(file_path): 75 | with open(file_path, 'rb') as f: 76 | cont = f.read() 77 | sha1 = hashlib.sha1(cont).hexdigest() 78 | md5 = hashlib.md5(cont).hexdigest() 79 | file_size = os.stat(file_path).st_size 80 | return sha1, md5, file_size 81 | 82 | 83 | def process_file(raw_path, file_name): 84 | file_type,file_path,file_extension = extract_file(raw_path) 85 | print "raw_file:", raw_path 86 | print "file_path:", file_path 87 | if not file_type: 88 | print "This is NOT a file of interest! " 89 | print "Removing raw data from disk:", raw_path 90 | # remove the related raw file 91 | os.remove(raw_path) 92 | print "Removed!" 93 | return 94 | print "file_type:", file_type 95 | 96 | # If we are really dealing with a PE file 97 | sha1, md5, file_size = get_file_hashes(file_path) 98 | dump_id, corrupt_pe, host, client, server = db_file_dumps(raw_path, sha1, md5, file_size, file_type) 99 | 100 | skip_classification = False 101 | score = None 102 | 103 | # check if we have already recently classified the same md5 dump from the same host 104 | md5_cache_key = md5 105 | if host is not None: 106 | md5_cache_key += '-'+host 107 | if md5_cache_key in md5host_cache.keys(): 108 | md5host_cache[md5_cache_key]['count'] += 1 109 | if md5host_cache[md5_cache_key]['count'] > MAX_MD5_CACHE_COUNT: 110 | # do not classify again! 
retrieve cached score 111 | skip_classification = True 112 | score = md5host_cache[md5_cache_key]['score'] # get the last cached score 113 | print "MD5 CACHE: will use previous score : %s %s %s %s" %(dump_id,md5,host,score) 114 | elif not corrupt_pe: 115 | md5host_cache[md5_cache_key] = {'count':1, 'score':None} 116 | 117 | # check if we have already recently classified several dumps from the same host,client,server 118 | hostcs_cache_key = '' 119 | if host is not None: 120 | hostcs_cache_key += host 121 | hostcs_cache_key += '-'+client 122 | hostcs_cache_key += '-'+server 123 | if hostcs_cache_key in hostcs_cache.keys(): 124 | hostcs_cache[hostcs_cache_key]['count'] += 1 125 | if hostcs_cache[hostcs_cache_key]['count'] > MAX_HOSTCS_CACHE_COUNT: 126 | # do not classify again! retrieve cached score 127 | skip_classification = True 128 | if score is None: 129 | score = hostcs_cache[hostcs_cache_key]['score'] # get the last cached score 130 | print "HOSTCS CACHE: will use previous score : %s %s %s %s" %(dump_id,host,server,score) 131 | elif not corrupt_pe: 132 | hostcs_cache[hostcs_cache_key] = {'count':1, 'score':None} 133 | 134 | 135 | if not corrupt_pe and (not skip_classification or score is None): 136 | ip2asn(dump_id) 137 | get_feature_vector(dump_id,file_type) 138 | score = classify_dump(dump_id) 139 | md5host_cache[md5_cache_key]['score'] = score # update cached score 140 | hostcs_cache[hostcs_cache_key]['score'] = score # update cached score 141 | 142 | # query VT 143 | Process(target=process_timeout, 144 | args=(db_virus_total, (dump_id,), VT_TIMEOUT)).start() 145 | if vts_config == "manual": # attempt to re-download the file "manually" 146 | Process(target=process_timeout, 147 | args=(manual_download, sha1, MD_TIMEOUT)).start() 148 | 149 | if not corrupt_pe: 150 | if score is None: print "ERROR : None score : this should not happen! dump_id=", dump_id 151 | if skip_classification and not score is None: 152 | update_score(dump_id,score) 153 | print "Syslog score = %s (dump_id=%s)" % (score, dump_id) 154 | Process(target=db_syslog, args=(dump_id,score)).start() 155 | 156 | sha1_path = os.path.join( 157 | FILES_DIR, "%s.%s" % (sha1,file_extension)) 158 | md5_path = os.path.join( 159 | FILES_DIR, "%s.%s" % (md5,file_extension)) 160 | shutil.move(file_path, sha1_path) 161 | print "sha1_path", sha1_path 162 | print "md5_path", md5_path 163 | if not os.path.exists(md5_path): 164 | os.symlink("%s.%s" % (sha1,file_extension), md5_path) 165 | print "Done processing file: %s" % (raw_path,) 166 | 167 | 168 | def start_amico(): 169 | Process(target=vt_submissions_func).start() 170 | print "Started amico_scripts" 171 | while True: 172 | p = subprocess.Popen( 173 | 'ls -atr %s |egrep "\:[0-9]+\-[0-9]+$" | egrep -v "\.tmp$"' % 174 | (DUMP_DIR,), 175 | stdout=subprocess.PIPE, shell=True) 176 | output = p.communicate()[0] 177 | file_names = [i.strip() for i in output.split('\n') if i.strip() != ''] 178 | for file_name in file_names: 179 | file_path = os.path.join(DUMP_DIR, file_name) 180 | if not is_whitelisted(file_path): 181 | raw_path = os.path.join(RAW_DIR, file_name) 182 | shutil.copy(file_path, RAW_DIR) 183 | try: 184 | process_file(raw_path, file_name) 185 | except Exception as e: 186 | print "Exception in processing file %s" % (raw_path,) 187 | print e 188 | traceback.print_exc() 189 | else: 190 | print "domain in %s is whitelisted. Ignoring..." 
% (file_path,) 191 | os.remove(file_path) 192 | time.sleep(WAIT_TIME) 193 | 194 | if __name__ == "__main__": 195 | start_amico() 196 | -------------------------------------------------------------------------------- /amico_scripts/trainer.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2014 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | from datetime import timedelta, date, datetime 15 | import psycopg2.extras 16 | import psycopg2.extensions 17 | import subprocess 18 | import sys 19 | import os 20 | 21 | from train_config import training_days, training_start_date 22 | from features import features 23 | import util 24 | 25 | 26 | class Trainer: 27 | def __init__(self,): 28 | self.output_file = "train.arff" 29 | self.conn = util.connect_to_db() 30 | self.conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_READ_COMMITTED) 31 | self.clean_label_delta = timedelta(days=30) 32 | self.training_end_date = date.today() 33 | if training_start_date: 34 | self.training_start_date = datetime.strptime(training_start_date, 35 | "%Y-%m-%d") 36 | else: 37 | cursor = self.conn.cursor() 38 | cursor.execute(""" 39 | SELECT MIN(timestamp) 40 | FROM pe_dumps""") 41 | if cursor.rowcount > 0: 42 | self.training_start_date = cursor.fetchone()[0].date() 43 | else: 44 | print "No entries in the database to train!" 
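# (without any pe_dumps rows there is nothing to label, so bail out
# instead of emitting an empty train.arff)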
45 | sys.exit() 46 | cursor.close() 47 | if training_days: 48 | self.training_end_date = (self.training_start_date + 49 | timedelta(days=training_days)) 50 | print "Training start date:", self.training_start_date.strftime("%B %d, %Y") 51 | print "Training end date:", self.training_end_date.strftime("%B %d, %Y") 52 | 53 | def count(self,): 54 | self.benign_dumps = self.get_benign_dumps() 55 | self.malicious_dumps = self.get_malicious_dumps() 56 | print "# benign dumps", len(self.benign_dumps) 57 | print "# malware dumps", len(self.malicious_dumps) 58 | 59 | def train(self,): 60 | model_name = datetime.today().strftime("%b%d_%y_%H%M%S") 61 | model_output_file = "models/%s.model" % (model_name,) 62 | self.benign_dumps = self.get_benign_dumps() 63 | self.malicious_dumps = self.get_malicious_dumps() 64 | print "# benign dumps", len(self.benign_dumps) 65 | print "# malware dumps", len(self.malicious_dumps) 66 | self.print_arff() 67 | subprocess.call(""" 68 | java -Xmx2000m -cp ./weka.jar weka.classifiers.meta.FilteredClassifier -t train.arff -d %s -p 1,58,59 -distribution -F "weka.filters.unsupervised.attribute.RemoveType -T string" -W weka.classifiers.trees.RandomForest -- -K 0 -S 1 -I 50 > logs/training/%s.log 69 | """ % (model_output_file, model_name), shell=True) 70 | print "New model trained: %s" % (model_output_file,) 71 | print "Log file: logs/training/%s.log" % (model_name,) 72 | os.remove("train.arff") 73 | 74 | def get_arff_line(self, dump_id, is_benign): 75 | self.cursor = self.conn.cursor( 76 | cursor_factory=psycopg2.extras.NamedTupleCursor) 77 | values = [] 78 | self.cursor.execute(""" 79 | SELECT * FROM weka_features 80 | WHERE dump_id = %s""", 81 | (dump_id, )) 82 | if self.cursor.rowcount == 0: 83 | return 84 | res = self.cursor.fetchone() 85 | res = res._asdict() 86 | for feature in features: 87 | values.append(res[feature]) 88 | try: 89 | data_string = ','.join(['?' 
if (value is None or value is '') else 90 | str(value) for value in values]) 91 | except Exception as e: 92 | print "Error in generating the feature vector in ARFF", e 93 | return 94 | if is_benign: 95 | data_string += ",neg" 96 | else: 97 | data_string += ",pos" 98 | self.cursor.close() 99 | return data_string 100 | 101 | def print_arff(self,): 102 | w = open(self.output_file, 'w') 103 | w.write('@RELATION train\n\n') 104 | for feature in features: 105 | if feature in ['sha1', 'dump_id', 'host', 'corrupt', 106 | 'vt_month_shelf', 'url_struct']: 107 | data_type = "STRING" 108 | elif feature == "extension_class": 109 | data_type = ("{common_ext,unknown_ext,common_fake,other_ext," 110 | "no_url,no_ext}") 111 | else: 112 | data_type = "NUMERIC" 113 | w.write('@ATTRIBUTE %s %s\n' % (feature, data_type)) 114 | #print "%s : %s" % (key, res[key]) 115 | 116 | w.write('@ATTRIBUTE class {pos, neg}\n\n') 117 | w.write('@DATA\n\n') 118 | for dump_id in self.benign_dumps: 119 | arff_line = self.get_arff_line(dump_id, True) 120 | if arff_line: 121 | w.write(arff_line + '\n') 122 | for dump_id in self.malicious_dumps: 123 | arff_line = self.get_arff_line(dump_id, False) 124 | if arff_line: 125 | w.write(arff_line + '\n') 126 | w.close() 127 | 128 | def get_benign_dumps(self,): 129 | self.cursor = self.conn.cursor() 130 | self.cursor.execute(""" 131 | SELECT DISTINCT(sha1) 132 | FROM 133 | virus_total_scans as vts JOIN 134 | virus_total_submissions as vt_sub 135 | USING (sha1) 136 | WHERE 137 | vt_sub.scan_time - vts.scan_time > %s 138 | AND vt_sub.num_av_labels = 0 139 | """, (self.clean_label_delta,)) 140 | hashes = set(self.cursor.fetchall()) 141 | self.cursor.execute(""" 142 | SELECT DISTINCT(sha1) 143 | FROM 144 | virus_total_submissions as vts JOIN 145 | virus_total_submissions as vt_sub 146 | USING (sha1) 147 | WHERE 148 | vt_sub.scan_time - vts.scan_time > %s 149 | AND vt_sub.num_av_labels = 0 150 | """, (self.clean_label_delta,)) 151 | hashes.update(self.cursor.fetchall()) 152 | dumps = set() 153 | for sha1 in hashes: 154 | self.cursor.execute(""" 155 | SELECT dump_id 156 | FROM pe_dumps 157 | WHERE timestamp >= %s AND 158 | timestamp <= %s AND 159 | sha1 = %s 160 | """, (self.training_start_date, self.training_end_date, 161 | sha1)) 162 | dumps.update(self.cursor.fetchall()) 163 | self.cursor.close() 164 | return dumps 165 | 166 | def get_malicious_dumps(self,): 167 | self.cursor = self.conn.cursor() 168 | self.cursor.execute(""" 169 | SELECT DISTINCT(sha1) 170 | FROM 171 | virus_total_scans as vts JOIN 172 | virus_total_submissions as vt_sub 173 | USING (sha1) 174 | WHERE 175 | vt_sub.scan_time - vts.scan_time > %s 176 | AND vt_sub.trusted_av_labels >= 2 177 | """, (self.clean_label_delta,)) 178 | hashes = set(self.cursor.fetchall()) 179 | self.cursor.execute(""" 180 | SELECT DISTINCT(sha1) 181 | FROM 182 | virus_total_submissions as vts JOIN 183 | virus_total_submissions as vt_sub 184 | USING (sha1) 185 | WHERE 186 | vt_sub.scan_time - vts.scan_time > %s 187 | AND vt_sub.trusted_av_labels >= 2 188 | """, (self.clean_label_delta,)) 189 | hashes.update(self.cursor.fetchall()) 190 | dumps = set() 191 | for sha1 in hashes: 192 | self.cursor.execute(""" 193 | SELECT dump_id 194 | FROM pe_dumps 195 | WHERE timestamp >= %s AND 196 | timestamp <= %s AND 197 | sha1 = %s 198 | """, (self.training_start_date, self.training_end_date, 199 | sha1)) 200 | dumps.update(self.cursor.fetchall()) 201 | self.cursor.close() 202 | return dumps 203 | 204 | if __name__ == "__main__": 205 | trainer = Trainer() 206 | if 
len(sys.argv) > 1 and sys.argv[1] == "-c": 207 | trainer.count() 208 | else: 209 | trainer.train() 210 | -------------------------------------------------------------------------------- /amico_scripts/vt_submit.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011-2013 Phani Vadrevu and Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from datetime import timedelta, date 17 | import time 18 | 19 | import simplejson 20 | import logging 21 | import logging.config 22 | 23 | from config import * 24 | import vt_api 25 | import util 26 | 27 | LOG_CONF_FILE = "logging.conf" 28 | 29 | class VTSubmissions: 30 | def __init__(self): 31 | self.QUERY_RATE_LIMIT = 10 32 | self.ONE_MIN = 60 33 | 34 | logging.config.fileConfig(LOG_CONF_FILE) 35 | self.logger = logging.getLogger("amico_logger") 36 | #stdout_handler = logging.StreamHandler(sys.stdout) 37 | #stdout_handler.setLevel(logging.DEBUG) 38 | #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s' 39 | #'- %(message)s') 40 | #stdout_handler.setFormatter(formatter) 41 | #self.logger.addHandler(stdout_handler) 42 | 43 | util.setup_socks() 44 | self.conn = util.connect_to_db() 45 | self.cursor = self.conn.cursor() 46 | 47 | self.today = date.today().strftime("%Y-%m-%d") 48 | self.yesterday = (date.today() - 49 | timedelta(days=1)).strftime("%Y-%m-%d") 50 | self.last_month = (date.today() - 51 | timedelta(days=30)).strftime("%Y-%m-%d") 52 | 53 | def get_hashes_from_db(self): 54 | if vt_submissions == "manual": 55 | hashes = self.get_hashes_from_db_manual() 56 | elif vt_submissions == "live": 57 | hashes = self.get_hashes_from_db_live() 58 | else: 59 | hashes = self.get_hashes_from_db_scans() 60 | 61 | self.logger.debug("get_hashes_from_db(): Yesterday's hahses: %s", len(hashes)) 62 | self.hashes = self.update_hashes(hashes) 63 | 64 | def update_hashes(self, hashes): 65 | self.cursor.execute(""" 66 | SELECT distinct md5, sha1 67 | FROM virus_total_submissions 68 | WHERE (submit_time::date) = %s 69 | """, (self.last_month,)) 70 | if self.cursor.rowcount > 0: 71 | hashes = hashes.union(self.cursor.fetchall()) 72 | self.cursor.execute(""" 73 | SELECT distinct md5, sha1 74 | FROM virus_total_submissions 75 | WHERE (submit_time::date) > %s AND 76 | (submit_time::date) < %s 77 | """, (self.last_month, self.yesterday)) 78 | if self.cursor.rowcount > 0: 79 | hashes = hashes.difference(self.cursor.fetchall()) 80 | self.cursor.execute(""" 81 | SELECT distinct md5, sha1 82 | FROM virus_total_submissions 83 | WHERE (submit_time::date) = %s 84 | """, (self.today,)) 85 | if self.cursor.rowcount > 0: 86 | hashes = hashes.difference(self.cursor.fetchall()) 87 | self.logger.debug("update_hashes(): Number of hashes: %s", len(hashes)) 88 | return hashes 89 | 90 | def get_hashes_from_db_scans(self): 91 | self.cursor.execute(""" 92 | SELECT distinct md5, sha1 93 | FROM virus_total_scans 94 | WHERE json IS NOT NULL AND 95 | query_time::date = 
%s 96 | """, (self.yesterday,)) 97 | if self.cursor.rowcount > 0: 98 | hashes = set(self.cursor.fetchall()) 99 | else: 100 | hashes = set() 101 | return hashes 102 | 103 | def get_hashes_from_db_live(self): 104 | self.cursor.execute(""" 105 | SELECT distinct md5, sha1 106 | FROM pe_dumps 107 | WHERE sha1 IS NOT NULL AND 108 | timestamp::date = %s 109 | """, (self.yesterday,)) 110 | if self.cursor.rowcount > 0: 111 | hashes = set(self.cursor.fetchall()) 112 | else: 113 | hashes = set() 114 | return hashes 115 | 116 | def get_hashes_from_db_manual(self): 117 | self.logger.debug("entered get_hashes_from_db_manual()") 118 | self.cursor.execute(""" 119 | SELECT distinct md5, sha1 120 | FROM manual_download_checksums 121 | WHERE referer_exists = 'f' AND 122 | sha1 IS NOT NULL AND 123 | timestamp::date = %s 124 | """, (self.yesterday,)) 125 | if self.cursor.rowcount > 0: 126 | hashes = set(self.cursor.fetchall()) 127 | else: 128 | hashes = set() 129 | return hashes 130 | 131 | def insert_scan(self, sha1, md5, response): 132 | self.logger.debug("entered insert_scan()") 133 | self.cursor.execute(""" 134 | INSERT INTO virus_total_submissions 135 | (submit_time, sha1, md5, scan_id) 136 | VALUES (LOCALTIMESTAMP, %s, %s, %s) 137 | RETURNING vt_submit_id 138 | """, (sha1, md5, response['scan_id'])) 139 | vt_submit_id = self.cursor.fetchone()[0] 140 | self.cursor.execute(""" 141 | UPDATE virus_total_submissions 142 | SET resubmit_id = %s 143 | WHERE sha1= %s AND 144 | submit_time::date = %s 145 | """, (vt_submit_id, sha1, self.last_month)) 146 | 147 | def check_report_exists(self, sha1): 148 | self.cursor.execute(""" 149 | SELECT * FROM virus_total_scans 150 | WHERE sha1 = %s AND 151 | scan_time IS NOT NULL""", (sha1, )) 152 | report_exists = True if self.cursor.rowcount else False 153 | self.cursor.execute(""" 154 | SELECT * FROM virus_total_submissions 155 | WHERE sha1 = %s AND 156 | json IS NOT NULL""", (sha1, )) 157 | report_exists = True if self.cursor.rowcount else report_exists 158 | return report_exists 159 | 160 | def make_request(self, md5, sha1): 161 | self.logger.debug("entered make_request()") 162 | self.logger.debug("sha1: %s", sha1) 163 | report_exists = self.check_report_exists(sha1) 164 | self.logger.debug("report_exists: %s", report_exists) 165 | json = None 166 | try: 167 | json = (vt_api.rescan_request(md5) if report_exists else 168 | vt_api.send_file(md5)) 169 | if json: 170 | response = simplejson.loads(json) 171 | if response["response_code"] == 1: 172 | self.insert_scan(sha1, md5, response) 173 | return True 174 | else: 175 | self.logger.warning("make_request: Bad response code: %s", 176 | response["response_code"]) 177 | else: 178 | self.logger.warning("make_request: No JSON response") 179 | except Exception as e: 180 | self.logger.exception("report_exists: %s", report_exists) 181 | self.logger.exception("json: %s", json) 182 | self.logger.exception("sha1: %s", sha1) 183 | self.logger.exception("make_request: Error %s", e) 184 | return False 185 | 186 | def submit_hashes(self): 187 | self.logger.debug("entered submit_hashes()") 188 | query_count = 0 189 | done_hashes = set() 190 | for md5, sha1 in self.hashes: 191 | tries = 0 192 | # This loop makes max 3 attempts to send a scan request 193 | while tries <= 3: 194 | if query_count == self.QUERY_RATE_LIMIT: 195 | self.logger.debug( 196 | "Query limit reached. 
Sleeping for a min.") 197 | time.sleep(self.ONE_MIN) 198 | query_count = 0 199 | tries += 1 200 | query_count += 1 201 | if self.make_request(md5, sha1): 202 | done_hashes.add((md5, sha1)) 203 | break 204 | if len(self.hashes): 205 | self.logger.debug("Submitted the hashes on: %s", date.today()) 206 | self.hashes.difference_update(done_hashes) 207 | 208 | def update_table_with_report(self, scan_id, report, json): 209 | self.logger.debug("entered update_table_with_report()") 210 | scan_time = report["scan_date"] 211 | scans = report["scans"] 212 | num_av_labels = report["positives"] 213 | trusted_av_labels = 0 214 | for k, v in scans.iteritems(): 215 | if v["detected"] is True: 216 | if k in trusted_av_vendors: 217 | trusted_av_labels += 1 218 | scan_time += " UTC" 219 | self.cursor.execute(""" 220 | UPDATE virus_total_submissions 221 | SET trusted_av_labels = %s, 222 | num_av_labels = %s, 223 | scan_time = TIMESTAMP WITH TIME ZONE %s, 224 | json = %s 225 | WHERE scan_id = %s and json is NULL""", 226 | (trusted_av_labels, num_av_labels, scan_time, 227 | json, scan_id)) 228 | 229 | def fetch_reports(self): 230 | self.logger.debug("entered fetch_reports()") 231 | self.cursor.execute(""" 232 | SELECT scan_id 233 | FROM virus_total_submissions 234 | WHERE json is NULL and 235 | (LOCALTIMESTAMP - submit_time) > '5 minutes' and 236 | (LOCALTIMESTAMP - submit_time) < '3 days' 237 | ORDER BY submit_time ASC""") 238 | scan_ids = [row[0] for row in self.cursor.fetchall()] 239 | self.logger.debug("fetch_reports(): %s scan reports to be fetched", 240 | len(scan_ids)) 241 | query_count = 0 242 | for scan_id in scan_ids: 243 | if query_count == self.QUERY_RATE_LIMIT: 244 | self.logger.debug( 245 | "Query limit reached. Sleeping for a min.") 246 | time.sleep(self.ONE_MIN) 247 | query_count = 0 248 | query_count += 1 249 | try: 250 | json = vt_api.get_vt_report(scan_id) 251 | if not json: 252 | self.logger.debug("No json") 253 | continue 254 | report = simplejson.loads(json) 255 | # Sometimes, we get the old reports wrongly 256 | if (report["response_code"] != 1) or ( 257 | report['scan_id'] != scan_id): 258 | self.logger.debug("Response code %s for scan_id %s" % 259 | (report["response_code"], scan_id)) 260 | continue 261 | self.update_table_with_report(scan_id, report, json) 262 | except Exception as e: 263 | self.logger.exception( 264 | "Error in fetching report for scan_id %s: %s" % (scan_id, e)) 265 | continue 266 | 267 | 268 | def sleep_for_the_day(): 269 | today = date.today() 270 | while today == date.today(): 271 | time.sleep(15 * 60) 272 | 273 | 274 | def vt_submissions_func(): 275 | vt_submit = VTSubmissions() 276 | vt_submit.get_hashes_from_db() 277 | while True: 278 | try: 279 | vt_submit.submit_hashes() 280 | vt_submit.fetch_reports() 281 | except Exception as e: 282 | vt_submit.logger.exception( 283 | "Unexpected error! 
274 | def vt_submissions_func(): 275 | vt_submit = VTSubmissions() 276 | vt_submit.get_hashes_from_db() 277 | while True: 278 | try: 279 | vt_submit.submit_hashes() 280 | vt_submit.fetch_reports() 281 | except Exception as e: 282 | vt_submit.logger.exception( 283 | "Unexpected error! %s \n Sleeping for the rest of the day", e) 284 | sleep_for_the_day() 285 | 286 | vt_submit.logger.debug("main(): Sleeping for 15 min.") 287 | time.sleep(vt_submit.ONE_MIN * 15) 288 | 289 | today = date.today().strftime("%Y-%m-%d") 290 | if today != vt_submit.today: 291 | vt_submit.today = today 292 | vt_submit.yesterday = (date.today() - 293 | timedelta(days=1)).strftime("%Y-%m-%d") 294 | vt_submit.last_month = (date.today() - 295 | timedelta(days=30)).strftime("%Y-%m-%d") 296 | vt_submit.get_hashes_from_db() 297 | 298 | 299 | if __name__ == "__main__": 300 | vt_submissions_func() 301 | -------------------------------------------------------------------------------- /file_dump/lru-cache.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This is an implementation of a O(1) LRU cache. 3 | * Copyright (C) 2010 Roberto Perdisci (perdisci@cs.uga.edu) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <stdio.h> 20 | #include "lru-cache.h" 21 | 22 | // #define LRUC_DEBUG 23 | #define HT_SIZE_FACTOR 10 24 | #define LRUC_MIN_ENTRIES 10 25 | 26 | /* Initializes the Hash Table */ 27 | hash_table_t* ht_init(u_int length) { 28 | 29 | int i; 30 | 31 | hash_table_t* ht = (hash_table_t*)malloc(sizeof(hash_table_t)); 32 | ht->length = length * HT_SIZE_FACTOR; 33 | ht->vect = (ht_entry_t**)malloc(sizeof(ht_entry_t*) * ht->length); 34 | for(i=0; i < ht->length; i++) 35 | ht->vect[i] = NULL; 36 | 37 | return ht; 38 | 39 | } 40 | 41 | 42 | /* Deallocate memory for Hash Table */ 43 | void ht_destroy(hash_table_t* ht) { 44 | 45 | ht_entry_t *v; 46 | u_int i; 47 | 48 | if(ht == NULL) 49 | return; 50 | 51 | for(i=0; i < ht->length; i++) { 52 | v = ht->vect[i]; 53 | while(v != NULL) { 54 | ht_entry_t *p = v; 55 | v = v->next; 56 | #ifdef LRUC_DEBUG 57 | printf("Destroying ht vect entry!\n"); 58 | fflush(stdout); 59 | #endif 60 | free(p); 61 | } 62 | } 63 | 64 | free(ht->vect); 65 | ht->vect = NULL; 66 | 67 | free(ht); 68 | 69 | } 70 | 71 | 72 | void default_destroy_val_fn(void *v) { 73 | free(v); 74 | } 75 | 76 | 77 | /* Initializes the LRU cache in the special case of char* values */ 78 | lru_cache_t* lruc_init_str(u_int max_entries) { 79 | return lruc_init(max_entries, default_destroy_val_fn); 80 | } 81 | 82 | 83 | /* Initializes the LRU cache */ 84 | lru_cache_t* lruc_init(u_int max_entries, void (*destroy_val_fn)(void*)) { 85 | 86 | lru_cache_t* lruc = (lru_cache_t*)malloc(sizeof(lru_cache_t)); 87 | lruc->ht = ht_init(max_entries); 88 | lruc->top = NULL; 89 | if(destroy_val_fn != NULL) 90 | lruc->destroy_val_fn = destroy_val_fn; 91 | else 92 | lruc->destroy_val_fn = default_destroy_val_fn; 93 | lruc->num_entries = 0; 94 | lruc->max_entries = LRUC_MIN_ENTRIES; // enforce at least this many entries 95 | if(max_entries > LRUC_MIN_ENTRIES) 96 | lruc->max_entries = max_entries; 97 | 98 | return lruc; 99 | } 100 | 101 | 
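/* Hypothetical usage sketch (illustrative only; record_t and record_destroy
 * are not part of this source): lruc_init() with a caller-supplied destructor
 * is meant for values that own further allocations of their own.
 *
 *   typedef struct { char *name; char *path; } record_t;
 *
 *   void record_destroy(void *v) {
 *       record_t *r = (record_t*)v;
 *       free(r->name);
 *       free(r->path);
 *       free(r);
 *   }
 *
 *   lru_cache_t *c = lruc_init(1000, record_destroy);
 *   // ... lruc_insert(c, key, rec); lruc_search(c, key); ...
 *   lruc_destroy(c);  // record_destroy() runs on every cached value
 */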
102 | /* Deallocate memory for LRU cache */ 103 | void lruc_destroy(lru_cache_t *lruc) { 104 | 105 | if(lruc == NULL) 106 | return; 107 | 108 | if(lruc->top == NULL) { // empty cache: still release the hash table and the cache struct 109 | ht_destroy(lruc->ht); free(lruc); return; } 110 | 111 | if(lruc->top->prev == NULL) { // only one entry... (defensive: entries are normally circular) 112 | free(lruc->top->key); 113 | if(lruc->top->value != NULL) 114 | lruc->destroy_val_fn(lruc->top->value); 115 | free(lruc->top); 116 | ht_destroy(lruc->ht); free(lruc); return; 117 | } 118 | 119 | lruc->top->prev->next = NULL; // break the circular list 120 | while(lruc->top != NULL) { 121 | lruc_entry_t *t = lruc->top; 122 | lruc->top = lruc->top->next; 123 | free(t->key); 124 | if(t->value != NULL) { 125 | lruc->destroy_val_fn(t->value); 126 | t->value = NULL; 127 | } 128 | free(t); 129 | } 130 | 131 | ht_destroy(lruc->ht); 132 | lruc->ht = NULL; 133 | 134 | free(lruc); 135 | 136 | } 137 | 138 | 139 | /* Inserts an element into the Hash Table 140 | * 'lruc_e' is a pointer to the (key,value) entry in the LRU cache 141 | * related to the 'key' parameter 142 | */ 143 | void ht_insert(hash_table_t *ht, lruc_entry_t *lruc_e, const char *key) { 144 | 145 | ht_entry_t *v; 146 | 147 | u_int h = hash_fn(key) % ht->length; 148 | ht_entry_t *e = (ht_entry_t*)malloc(sizeof(ht_entry_t)); 149 | e->key = key; 150 | e->le = lruc_e; 151 | e->next = NULL; 152 | 153 | v = ht->vect[h]; 154 | if(v == NULL) { 155 | ht->vect[h] = e; 156 | return; 157 | } 158 | 159 | while(v->next != NULL) 160 | v = v->next; 161 | 162 | v->next = e; 163 | 164 | } 165 | 166 | /* Inserts a (key,value) pair in the LRU cache. 167 | * Notice that value could be NULL, but the key cannot be NULL 168 | */ 169 | 170 | int lruc_insert_str(lru_cache_t *lruc, const char *key, const char* value) { 171 | 172 | int ret = lruc_insert(lruc, key, NULL); 173 | if(ret == 0 && value != NULL) { // attach the value only if a new entry was actually created 174 | lruc_entry_t *e = ht_search(lruc->ht, key); 175 | e->value = (char*)malloc(sizeof(char)*(strlen(value)+1)); 176 | strcpy(e->value, value); 177 | } 178 | 179 | return ret; 180 | 181 | } 182 | 183 | int lruc_insert(lru_cache_t *lruc, const char *key, void* value) { 184 | 185 | if(key == NULL) 186 | return -1; 187 | 188 | if(lruc_search(lruc, key)!=NULL) 189 | return -1; 190 | 191 | lruc->num_entries++; 192 | #ifdef LRUC_DEBUG 193 | printf("Inserting %u\n", lruc->num_entries); 194 | #endif 195 | 196 | lruc_entry_t *e = (lruc_entry_t*)malloc(sizeof(lruc_entry_t)); 197 | e->key = (char*)malloc(sizeof(char)*(strlen(key)+1)); 198 | strcpy(e->key, key); 199 | e->value = value; 200 | e->time = time(NULL); 201 | 202 | /* the cache is implemented as a doubly-linked circular list */ 203 | if(lruc->top == NULL) { 204 | e->next = e; 205 | e->prev = e; 206 | } 207 | else if(lruc->num_entries <= lruc->max_entries) { 208 | e->prev = lruc->top->prev; 209 | e->next = lruc->top; 210 | 211 | lruc->top->prev->next = e; 212 | lruc->top->prev = e; 213 | } 214 | else { 215 | // printf("LRUC is full!\n"); 216 | // fflush(stdout); 217 | 218 | e->next = lruc->top; 219 | e->prev = lruc->top->prev->prev; 220 | lruc->top->prev->prev->next = e; 221 | lruc_entry_t *tmp = lruc->top->prev; 222 | lruc->top->prev = e; 223 | 224 | // evict from the cache 225 | ht_delete(lruc->ht, tmp->key); 226 | free(tmp->key); 227 | if(tmp->value != NULL) 228 | lruc->destroy_val_fn(tmp->value); 229 | free(tmp); 230 | 231 | lruc->num_entries--; 232 | 233 | // printf("Removed LRU element; inserted the new one!\n"); 234 | // fflush(stdout); 235 | } 236 | 237 | lruc->top = e; 238 | 239 | /* Insert e in the Hash Table for fast, O(1) searches */ 240 | ht_insert(lruc->ht, e, e->key); 241 | 242 | #ifdef LRUC_DEBUG 243 | printf("Inserted %u!\n", lruc->num_entries); 244 | print_ht(lruc->ht); 245 | #endif 246 | 247 | return 0; 248 | } 249 | 250 | 
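/* Implementation note: the entry list is circular and doubly linked, and
 * lruc->top always points at the most recently used entry, so lruc->top->prev
 * is always the least recently used one. This is why both eviction
 * (lruc_insert() above, when the cache is full) and expiration (clean_lruc())
 * can work on lruc->top->prev in O(1).
 */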
printf("Inserted!\n", lruc->num_entries); 244 | print_ht(lruc->ht); 245 | #endif 246 | 247 | return 0; 248 | } 249 | 250 | 251 | /* Delete key from Hash Table */ 252 | void ht_delete(hash_table_t *ht, const char *key) { 253 | 254 | ht_entry_t *v; 255 | ht_entry_t *prev; 256 | 257 | u_int h = hash_fn(key) % ht->length; 258 | #ifdef LRUC_DEBUG 259 | printf("key=%s, h=%u\n", key, h); 260 | #endif 261 | 262 | v = ht->vect[h]; 263 | prev = NULL; 264 | if(v != NULL) { 265 | do { 266 | if(strcmp(key, v->key) == 0) { 267 | if(prev != NULL) 268 | prev->next = v->next; 269 | else if(v->next != NULL) 270 | ht->vect[h] = v->next; 271 | else 272 | ht->vect[h] = NULL; 273 | free(v); 274 | return; 275 | } 276 | prev = v; 277 | v = v->next; 278 | } while(v != NULL); 279 | } 280 | 281 | return; 282 | 283 | } 284 | 285 | // Delete an entry from the LRU cache 286 | void lruc_delete(lru_cache_t *lruc, const char *key) { 287 | 288 | lruc_entry_t *e = ht_search(lruc->ht, key); 289 | 290 | if(e!=NULL) { 291 | if(lruc->top == e && lruc->top->next == e) // only one entry! 292 | lruc->top = NULL; 293 | else { 294 | if(lruc->top == e) 295 | lruc->top = e->next; 296 | 297 | e->prev->next = e->next; 298 | e->next->prev = e->prev; 299 | } 300 | 301 | ht_delete(lruc->ht, e->key); 302 | free(e->key); 303 | if(e->value != NULL) 304 | lruc->destroy_val_fn(e->value); 305 | free(e); 306 | 307 | lruc->num_entries--; 308 | } 309 | 310 | } 311 | 312 | 313 | /* Searches an LRU cache entry using the Hash Table */ 314 | lruc_entry_t* ht_search(const hash_table_t *ht, const char *key) { 315 | 316 | ht_entry_t *v; 317 | 318 | u_int h = hash_fn(key) % ht->length; 319 | v = ht->vect[h]; 320 | 321 | #ifdef LRUC_DEBUG 322 | printf("Hash Key = %u\n", h); 323 | printf("v = %p\n", v); 324 | fflush(stdout); 325 | #endif 326 | 327 | if(v == NULL) 328 | return NULL; 329 | 330 | while(v != NULL) { 331 | #ifdef LRUC_DEBUG 332 | printf("v is not null!\n"); 333 | printf("key = %s\n", key); 334 | printf("v->key = %s\n", v->key); 335 | fflush(stdout); 336 | #endif 337 | 338 | if(strcmp(key, v->key) == 0) 339 | return v->le; 340 | v = v->next; 341 | } 342 | 343 | #ifdef LRUC_DEBUG 344 | printf("HT entry not found! Returing NULL\n"); 345 | fflush(stdout); 346 | #endif 347 | 348 | return NULL; 349 | 350 | } 351 | 352 | 353 | char* lruc_search_str(lru_cache_t *lruc, const char *key) { 354 | return (char*)lruc_search(lruc, key); 355 | } 356 | 357 | 358 | void* lruc_search(lru_cache_t *lruc, const char *key) { 359 | 360 | lruc_entry_t *e = ht_search(lruc->ht, key); 361 | 362 | #ifdef LRUC_DEBUG 363 | printf("e = %p\n", e); 364 | fflush(stdout); 365 | #endif 366 | 367 | if(e == NULL) 368 | return NULL; 369 | 370 | #ifdef LRUC_DEBUG 371 | printf("Found element in Hash Table (%s, %s)\n", e->key, e->value); 372 | fflush(stdout); 373 | #endif 374 | 375 | 376 | if(e != lruc->top) { 377 | /* e is the most recently used: move it to the top, if needed! */ 378 | e->prev->next = e->next; 379 | e->next->prev = e->prev; 380 | e->prev = lruc->top->prev; 381 | e->next = lruc->top; 382 | lruc->top->prev->next = e; 383 | lruc->top->prev = e; 384 | lruc->top = e; 385 | } 386 | 387 | e->time = time(NULL); 388 | 389 | if(e->value != NULL) 390 | return e->value; 391 | 392 | return e->key; // we don't want to return NULL if there is a match! 
393 | // even if the value was NULL 394 | 395 | } 396 | 397 | 398 | u_int hash_fn(const char* key) { 399 | 400 | #define MAX_HASH_ITER 256 401 | return DJBHash(key, strnlen(key, MAX_HASH_ITER)); 402 | 403 | } 404 | 405 | 406 | /* The following hash function has been borrowed 407 | * and slightly modified from 408 | * http://www.partow.net/programming/hashfunctions/ 409 | * Author: Arash Partow 410 | */ 411 | u_int DJBHash(const char* str, u_int len) 412 | { 413 | u_int hash = 5381; 414 | u_int i = 0; 415 | 416 | for(i = 0; i < len; i++) 417 | { 418 | hash = ((hash << 5) + hash) + (str[i]); 419 | } 420 | 421 | return hash; 422 | } 423 | /***/ 424 | 425 | 426 | void print_ht(hash_table_t *ht) { 427 | 428 | ht_entry_t *v; 429 | u_int i; 430 | 431 | if(ht == NULL) 432 | return; 433 | 434 | for(i=0; i < ht->length; i++) { 435 | v = ht->vect[i]; 436 | if(v != NULL) { 437 | #ifdef LRUC_DEBUG 438 | printf("HASH_TAB_ENTRY: %s", v->key); 439 | #endif 440 | while(v->next!=NULL) { 441 | v = v->next; 442 | #ifdef LRUC_DEBUG 443 | printf(" | %s", v->key); 444 | #endif 445 | } 446 | #ifdef LRUC_DEBUG 447 | printf("\n"); 448 | #endif 449 | } 450 | } 451 | 452 | } 453 | 454 | 455 | 456 | void clean_lruc(lru_cache_t *lruc) { 457 | 458 | if(lruc==NULL) 459 | return; 460 | 461 | if(lruc->top == NULL) 462 | return; 463 | 464 | time_t t = time(NULL); 465 | // printf("Current Time = %u\n", t); 466 | 467 | do { 468 | lruc_entry_t *e = lruc->top->prev; 469 | // printf("e Time = %u\n", e->time); 470 | 471 | if((t - e->time) > MAX_LRUC_TTL) { 472 | if(lruc->destroy_val_fn != NULL) { 473 | lruc->destroy_val_fn(e->value); 474 | e->value = NULL; 475 | } 476 | lruc_delete(lruc, e->key); 477 | } 478 | else 479 | break; 480 | 481 | } while(lruc->top!=NULL); 482 | 483 | } 484 | 485 | 486 | void print_lruc(lru_cache_t *lruc) { 487 | 488 | if(lruc==NULL) 489 | return; 490 | 491 | if(lruc->top == NULL) 492 | return; 493 | 494 | lruc_entry_t *e = lruc->top; 495 | 496 | do { 497 | #ifdef LRUC_DEBUG 498 | printf("LRU_ENTRY: (k=%s , v=%s)\n", e->key, e->value); 499 | #endif 500 | e = e->next; 501 | } while(e != lruc->top); 502 | 503 | } 504 | 505 | 506 | /* A little bit of testing to make sure things are working correctly... 
*/ 507 | /** 508 | int main() { 509 | 510 | char k[256]; 511 | char v[256]; 512 | int i; 513 | 514 | printf("Initializing LRU cache...\n"); 515 | lru_cache_t *lruc = lruc_init_str(10); 516 | fflush(stdout); 517 | 518 | 519 | for(i=0; i < 10; i++) { 520 | printf("Inserting (key,val)\n"); 521 | fflush(stdout); 522 | sprintf(k, "key%d", (i+1)); 523 | sprintf(v, "value%d", (i+1)); 524 | lruc_insert_str(lruc, k, v); 525 | print_ht(lruc->ht); 526 | printf("###################\n"); 527 | } 528 | 529 | print_ht(lruc->ht); 530 | print_lruc(lruc); 531 | printf("###################\n"); 532 | 533 | sprintf(k, "key%d", 8); 534 | printf("Searching for k=%s\n", k); 535 | strcpy(v, lruc_search_str(lruc, k)); 536 | printf("Found v=%s\n", v); 537 | printf("###################\n"); 538 | 539 | for(i=10; i < 15; i++) { 540 | printf("Inserting (key,val)\n"); 541 | fflush(stdout); 542 | sprintf(k, "key%d", (i+1)); 543 | sprintf(v, "value%d", (i+1)); 544 | lruc_insert_str(lruc, k, v); 545 | print_ht(lruc->ht); 546 | printf("###################\n"); 547 | 548 | } 549 | 550 | print_ht(lruc->ht); 551 | print_lruc(lruc); 552 | 553 | for(i=6; i < 13; i++) { 554 | sprintf(k, "key%d", i); 555 | printf("Searching for k=%s\n", k); 556 | strcpy(v, lruc_search_str(lruc, k)); 557 | printf("Found v=%s\n", v); 558 | printf("###################\n"); 559 | } 560 | 561 | print_ht(lruc->ht); 562 | print_lruc(lruc); 563 | printf("###################\n"); 564 | 565 | for(i=16; i < 18; i++) { 566 | printf("Inserting (key,val)\n"); 567 | fflush(stdout); 568 | sprintf(k, "key%d", i); 569 | sprintf(v, "value%d", i); 570 | lruc_insert_str(lruc, k, v); 571 | print_ht(lruc->ht); 572 | printf("###################\n"); 573 | 574 | } 575 | 576 | print_ht(lruc->ht); 577 | print_lruc(lruc); 578 | printf("###################\n"); 579 | 580 | sprintf(k, "key%d", 1); 581 | printf("Searching for k=%s\n", k); 582 | if(lruc_search_str(lruc, k)!=NULL) { 583 | strcpy(v, lruc_search_str(lruc, k)); 584 | printf("Found v=%s\n", v); 585 | } 586 | printf("###################\n"); 587 | 588 | lruc_destroy(lruc); 589 | printf("Destroyed!\n"); 590 | 591 | } 592 | **/ 593 | 594 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price.
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. 
You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. 
If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail.
311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /amico_scripts/get_feature_vector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2012 Phani Vadrevu and Roberto Perdisci # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | import time 16 | import urlparse 17 | import re 18 | import psycopg2 19 | import util 20 | import sys 21 | import numpy as np 22 | import pandas as ps 23 | from datetime import timedelta 24 | from config import MAX_PAST_DUMPS, MAX_PAST_DAYS 25 | 26 | 27 | 28 | 29 | # TODO: Don't let the hash_life_time and num_dumps_with_same_hash values be null 30 | # TODO: Speed up the script 31 | # TODO: Verify how null values of x_malware_ratio features are handled by WEKA 32 | 33 | 34 | 
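# Illustrative sketch (hypothetical values, not part of the original file):
# compute_features_hts() below expects a frame with these columns, built from
# past pe_dumps rows joined with their VirusTotal scans:
#
#   df = ps.DataFrame([(1, 'aa...', 'www.example.com', '203.0.113.7/32', 0, 0),
#                      (2, 'bb...', 'www.example.com', '203.0.113.7/32', 3, 5)],
#                     columns=['dump_id','sha1','host','server','tavs','navs'])
#
# With the thresholds below, this frame yields total_downloads=2,
# benign_downloads=1 (navs == 0), malware_downloads=1 (tavs > 1),
# suspicious_downloads=1 (navs > 0), plus the derived ratios and the
# per-sha1 averages of the most recent navs/tavs values.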
35 | def compute_features_hts(df): 36 | """ Compute host/2ld/server-based features, depending on df """ 37 | 38 | # TODO: Move these thresholds to config file? 39 | FEAT_TAVS_THRESHOLD = 1 40 | FEAT_NAVS_THRESHOLD = 0 41 | 42 | td = len(set(df['dump_id'])) 43 | bd = len(set(df[df.navs==0]['dump_id'])) 44 | md = len(set(df[df.tavs>FEAT_TAVS_THRESHOLD]['dump_id'])) 45 | sd = len(set(df[df.navs>FEAT_NAVS_THRESHOLD]['dump_id'])) 46 | th = len(set(df['sha1'])) 47 | uh = len(set(df[df.navs.isnull()]['sha1'])) 48 | 49 | br = mr = sr = ur = None 50 | if td > 0: 51 | br = float(bd)/td 52 | mr = float(md)/td 53 | sr = float(sd)/td 54 | if th > 0: 55 | ur = float(uh)/th 56 | 57 | avg_navs = None 58 | avg_tavs = None 59 | 60 | sha1_nav_labels = [] 61 | sha1_tav_labels = [] 62 | for (sha,dfgroup) in df.groupby('sha1'): 63 | # sort to get only the most recent number of av labels for a given sha1 64 | d = dfgroup.sort_values('dump_id',ascending=False) 65 | if d['navs'].iat[0] is not None and not np.isnan(d['navs'].iat[0]): 66 | sha1_nav_labels.append(d['navs'].iat[0]) 67 | sha1_tav_labels.append(d['tavs'].iat[0]) 68 | if len(sha1_nav_labels) > 0: 69 | # average over different sha1s 70 | avg_navs = np.mean(sha1_nav_labels) 71 | avg_tavs = np.mean(sha1_tav_labels) 72 | 73 | return { 'total_downloads':td,'benign_downloads':bd,'malware_downloads':md,'suspicious_downloads':sd, 74 | 'total_hashes':th,'unknown_hashes':uh, 75 | 'benign_ratio':br,'malware_ratio':mr,'suspicious_ratio':sr,'unknown_hash_ratio':ur, 76 | 'avg_av_labels':avg_navs,'avg_trusted_labels':avg_tavs } 77 | 78 | 79 | def insert_hts_based_features(cursor, dump_id): 80 | """ Computes host/2ld/server-based features for a given download 81 | 82 | Arguments: 83 | cursor: DB cursor from an existing DB connection 84 | dump_id: id of download to be classified 85 | 86 | """ 87 | 88 | # also query for the timestamp, so we can use it to limit how far we go back in time! 89 | query = " SELECT host,server,DATE(timestamp) FROM pe_dumps WHERE dump_id = %s " 90 | 91 | cursor.execute(query,(dump_id, )) 92 | row = cursor.fetchone() 93 | if not row: 94 | return 95 | 96 | (host,server,date) = row 97 | domain = util.reorder_domain(host) 98 | twold = util.reorder_domain(util.extract_twold(domain)) 99 | twold_like = '-NONE-' # avoids any matching in "pe.host LIKE %s" in the query below 100 | if twold is None: 101 | if host is not None: 102 | twold = host 103 | twold_like = twold + '.%' 104 | 105 | query = """ 106 | SELECT dump_id,pe.sha1,pe.host,pe.server,trusted_av_labels,num_av_labels 107 | FROM pe_dumps AS pe 108 | JOIN ped_vts_mapping AS pvm 109 | USING(dump_id) 110 | JOIN virus_total_scans AS vts 111 | USING(vt_id) 112 | WHERE pe.corrupt = 'f' AND 113 | (pe.host = %s OR pe.host LIKE %s OR pe.server = %s) AND 114 | pe.dump_id < %s AND pe.dump_id > %s AND 115 | pe.timestamp > %s """ 116 | 117 | cursor.execute(query,(host, twold_like, server, dump_id, dump_id-MAX_PAST_DUMPS, date-timedelta(days=MAX_PAST_DAYS))) 118 | tuples = cursor.fetchall() 119 | 120 | # make the results into a pandas data frame 121 | if not tuples: 122 | df = ps.DataFrame(index=[], columns=['dump_id','sha1','host','server','tavs','navs']) 123 | else: 124 | df = ps.DataFrame.from_records(tuples) 125 | df.columns = ['dump_id','sha1','host','server','tavs','navs'] 126 | 127 | ### compute twold-based features 128 | df_twold = df[df['host'].str.startswith(twold)==True] 129 | twold_v = compute_features_hts(df_twold) 130 | 131 | ### compute host-based features 132 | df_host = df[df.host == host] 133 | host_v = compute_features_hts(df_host) 134 | 135 | ### compute server-based features 136 | df_server = df[df.server == server] 137 | server_v = 
compute_features_hts(df_server) 138 | 139 | twold_features = (twold_v['benign_downloads'],twold_v['malware_downloads'], twold_v['suspicious_downloads'], 140 | twold_v['total_downloads'], twold_v['malware_ratio'], twold_v['suspicious_ratio'], twold_v['benign_ratio'], 141 | twold_v['avg_av_labels'], twold_v['avg_trusted_labels'], 142 | twold_v['unknown_hashes'], twold_v['total_hashes'], twold_v['unknown_hash_ratio']) 143 | 144 | host_features = (host_v['benign_downloads'],host_v['malware_downloads'], host_v['suspicious_downloads'], 145 | host_v['total_downloads'], host_v['malware_ratio'], host_v['suspicious_ratio'], host_v['benign_ratio'], 146 | host_v['avg_av_labels'], host_v['avg_trusted_labels'], 147 | host_v['unknown_hashes'], host_v['total_hashes'], host_v['unknown_hash_ratio']) 148 | 149 | server_features = (server_v['benign_downloads'],server_v['malware_downloads'], server_v['suspicious_downloads'], 150 | server_v['total_downloads'], server_v['malware_ratio'], server_v['suspicious_ratio'], server_v['benign_ratio'], 151 | server_v['avg_av_labels'], server_v['avg_trusted_labels'], 152 | server_v['unknown_hashes'], server_v['total_hashes'], server_v['unknown_hash_ratio']) 153 | 154 | 155 | query = """ UPDATE weka_features SET 156 | 157 | twold_benign_downloads = %s, 158 | twold_malware_downloads = %s, 159 | twold_suspicious_downloads = %s, 160 | twold_total_downloads = %s, 161 | twold_malware_ratio = %s, 162 | twold_suspicious_ratio = %s, 163 | twold_benign_ratio = %s, 164 | twold_avg_av_labels = %s, 165 | twold_avg_trusted_labels = %s, 166 | twold_unknown_hashes = %s, 167 | twold_total_hashes = %s, 168 | twold_unknown_hash_ratio = %s, 169 | 170 | host_benign_downloads = %s, 171 | host_malware_downloads = %s, 172 | host_suspicious_downloads = %s, 173 | host_total_downloads = %s, 174 | host_malware_ratio = %s, 175 | host_suspicious_ratio = %s, 176 | host_benign_ratio = %s, 177 | host_avg_av_labels = %s, 178 | host_avg_trusted_labels = %s, 179 | host_unknown_hashes = %s, 180 | host_total_hashes = %s, 181 | host_unknown_hash_ratio = %s, 182 | 183 | server_ip_benign_downloads = %s, 184 | server_ip_malware_downloads = %s, 185 | server_ip_suspicious_downloads = %s, 186 | server_ip_total_downloads = %s, 187 | server_ip_malware_ratio = %s, 188 | server_ip_suspicious_ratio = %s, 189 | server_ip_benign_ratio = %s, 190 | server_ip_avg_av_labels = %s, 191 | server_ip_avg_trusted_labels = %s, 192 | server_ip_unknown_hashes = %s, 193 | server_ip_total_hashes = %s, 194 | server_ip_unknown_hash_ratio = %s 195 | 196 | where dump_id = %s """ 197 | 198 | try: 199 | cursor.execute(query, twold_features + host_features + server_features + (dump_id,)) 200 | except Exception as e: 201 | print e 202 | print "Could not insert server-based features for the dump #", dump_id 203 | 204 | 205 | 206 | def insert_bgp_based_features(cursor, dump_id): 207 | 208 | cursor.execute(""" 209 | SELECT server from pe_dumps where dump_id = %s""", (dump_id, )) 210 | server = cursor.fetchone()[0] 211 | 212 | cursor.execute(""" 213 | select bgp_prefix from bgp2asn where bgp_prefix >> %s""", (server,)) 214 | row = cursor.fetchone() 215 | if row is not None: 216 | bgp_prefix = row[0] 217 | else: 218 | return 219 | 220 | cursor.execute(""" 221 | SELECT COUNT(DISTINCT dump_id) 222 | FROM pe_dumps AS pe 223 | WHERE pe.server << %s AND 224 | pe.corrupt = 'f' AND 225 | pe.dump_id < %s AND pe.dump_id > %s """, 226 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 227 | bgp_total_downloads = cursor.fetchone()[0] 228 | 229 | # Disabled 
vt_month_shelf due to the 403 error from VT 230 | #cursor.execute(""" 231 | # SELECT count(distinct dump_id) from pe_dumps as pe JOIN 232 | # weka_features as f using (dump_id) 233 | # where f.raw_dump_num_av_labels = 0 and f.vt_month_shelf = 't' and 234 | # pe.server << %s and pe.dump_id < %s """, 235 | # (bgp_prefix, dump_id)) 236 | cursor.execute(""" 237 | SELECT COUNT(DISTINCT dump_id) 238 | FROM pe_dumps AS pe JOIN 239 | ped_vts_mapping AS pvm USING (dump_id), 240 | virus_total_scans AS vts 241 | WHERE vts.num_av_labels = 0 AND 242 | pe.corrupt = 'f' AND 243 | pe.server << %s AND 244 | pe.dump_id < %s AND pe.dump_id > %s AND 245 | vts.vt_id = pvm.vt_id""", 246 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 247 | bgp_benign_downloads = cursor.fetchone()[0] 248 | 249 | cursor.execute(""" 250 | SELECT COUNT(DISTINCT dump_id) 251 | FROM pe_dumps AS pe JOIN 252 | ped_vts_mapping AS pvm USING (dump_id), 253 | virus_total_scans AS vts 254 | WHERE vts.trusted_av_labels > 1 AND 255 | pe.corrupt = 'f' AND 256 | pe.server << %s AND 257 | pe.dump_id < %s AND pe.dump_id > %s AND 258 | vts.vt_id = pvm.vt_id""", 259 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 260 | bgp_malware_downloads = cursor.fetchone()[0] 261 | 262 | cursor.execute(""" 263 | SELECT COUNT(DISTINCT dump_id) 264 | FROM pe_dumps AS pe JOIN 265 | ped_vts_mapping AS pvm USING (dump_id), 266 | virus_total_scans AS vts 267 | WHERE vts.num_av_labels > 1 AND 268 | pe.corrupt = 'f' AND 269 | pe.server << %s AND 270 | pe.dump_id < %s AND pe.dump_id > %s AND 271 | vts.vt_id = pvm.vt_id""", 272 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 273 | bgp_suspicious_downloads = cursor.fetchone()[0] 274 | 275 | if bgp_total_downloads == 0: 276 | bgp_benign_ratio = None 277 | bgp_malware_ratio = None 278 | bgp_suspicious_ratio = None 279 | else: 280 | bgp_benign_ratio = float(bgp_benign_downloads) / bgp_total_downloads 281 | bgp_malware_ratio = float(bgp_malware_downloads) / bgp_total_downloads 282 | bgp_suspicious_ratio = float(bgp_suspicious_downloads) / bgp_total_downloads 283 | 284 | # The averages are over distinct sha1s 285 | cursor.execute(""" 286 | SELECT AVG(num_av_labels), AVG(trusted_av_labels) 287 | FROM 288 | (SELECT pe.sha1, MAX(dump_id) AS max_id 289 | FROM pe_dumps AS pe 290 | WHERE pe.server << %s AND 291 | pe.dump_id < %s AND pe.dump_id > %s AND 292 | pe.corrupt = 'f' GROUP BY pe.sha1) as a 293 | JOIN 294 | (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id 295 | FROM pe_dumps AS p JOIN 296 | ped_vts_mapping as pvm USING (dump_id), 297 | virus_total_scans as vts 298 | WHERE pvm.vt_id = vts.vt_id AND 299 | p.server << %s AND 300 | dump_id < %s AND dump_id > %s AND 301 | p.corrupt='f') as b 302 | ON a.max_id = b.dump_id 303 | WHERE num_av_labels IS NOT NULL""", 304 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS, bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 305 | if cursor.rowcount == 0: 306 | bgp_avg_av_labels = None 307 | bgp_avg_trusted_labels = None 308 | else: 309 | bgp_avg_av_labels, bgp_avg_trusted_labels = cursor.fetchone() 310 | 311 | # the oldest scan report is used to get the # of unknown hashes 312 | # to remove any bias due to VT submissions 313 | cursor.execute(""" 314 | SELECT COUNT(DISTINCT b.sha1) 315 | FROM 316 | (SELECT pe.sha1, MIN(dump_id) AS min_id 317 | FROM pe_dumps AS pe 318 | WHERE pe.server << %s AND 319 | pe.dump_id < %s AND pe.dump_id > %s AND 320 | pe.corrupt = 'f' GROUP BY pe.sha1) as a 321 | JOIN 322 | (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id 323 | FROM pe_dumps AS p 
JOIN 324 | ped_vts_mapping as pvm USING (dump_id), 325 | virus_total_scans as vts 326 | WHERE pvm.vt_id = vts.vt_id AND 327 | p.server << %s AND 328 | dump_id < %s AND dump_id > %s AND 329 | p.corrupt='f') as b 330 | ON a.min_id = b.dump_id 331 | WHERE num_av_labels IS NULL""", 332 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS, bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 333 | bgp_unknown_hashes = cursor.fetchone()[0] 334 | 335 | cursor.execute(""" 336 | SELECT COUNT(DISTINCT pe.sha1) 337 | FROM pe_dumps AS pe 338 | WHERE pe.server << %s AND 339 | pe.corrupt = 'f' AND 340 | pe.dump_id < %s AND pe.dump_id > %s """, 341 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 342 | bgp_total_hashes = cursor.fetchone()[0] 343 | if bgp_total_hashes != 0: 344 | bgp_unknown_hash_ratio = float(bgp_unknown_hashes) / bgp_total_hashes 345 | else: 346 | bgp_unknown_hash_ratio = None 347 | try: 348 | cursor.execute(""" 349 | UPDATE weka_features set bgp_benign_downloads = %s, 350 | bgp_malware_downloads = %s, 351 | bgp_suspicious_downloads = %s, 352 | bgp_total_downloads = %s, 353 | bgp_malware_ratio = %s, 354 | bgp_suspicious_ratio = %s, 355 | bgp_benign_ratio = %s, 356 | bgp_avg_av_labels = %s, 357 | bgp_avg_trusted_labels = %s, 358 | bgp_unknown_hashes = %s, 359 | bgp_total_hashes = %s, 360 | bgp_unknown_hash_ratio = %s 361 | where dump_id = %s """, 362 | (bgp_benign_downloads, bgp_malware_downloads, 363 | bgp_suspicious_downloads, 364 | bgp_total_downloads, bgp_malware_ratio, 365 | bgp_suspicious_ratio, 366 | bgp_benign_ratio, 367 | bgp_avg_av_labels, bgp_avg_trusted_labels, 368 | bgp_unknown_hashes, bgp_total_hashes, 369 | bgp_unknown_hash_ratio, dump_id)) 370 | except Exception as e: 371 | print "Could not insert bgp based features for the dump #", dump_id, "-", e 372 | 373 | 374 | def insert_hash_based_features(cursor, dump_id): 375 | cursor.execute("""select sha1 from pe_dumps where dump_id = %s""", 376 | (dump_id, )) 377 | sha1 = cursor.fetchone()[0] 378 | if sha1 is None: 379 | return 380 | cursor.execute(""" 381 | SELECT EXTRACT(EPOCH FROM (MAX(timestamp) - MIN(timestamp))), 382 | COUNT(DISTINCT pe.dump_id) 383 | FROM pe_dumps AS pe 384 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 385 | pe.corrupt = 'f' AND 386 | pe.sha1 = %s 387 | """, 388 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 389 | hash_life_time, num_dumps_with_same_hash = cursor.fetchone() 390 | 391 | if hash_life_time is None: 392 | hash_life_time = 0 393 | if num_dumps_with_same_hash is None: 394 | num_dumps_with_same_hash = 0 395 | 396 | cursor.execute(""" 397 | UPDATE weka_features 398 | SET hash_life_time = %s, 399 | num_dumps_with_same_hash = %s 400 | WHERE dump_id = %s""", 401 | (hash_life_time, num_dumps_with_same_hash, dump_id)) 402 | 403 | cursor.execute(""" 404 | SELECT count(*) FROM 405 | (SELECT DISTINCT client, 406 | DATE_TRUNC('DAY', timestamp) 407 | FROM pe_dumps AS pe 408 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 409 | pe.corrupt='f' AND 410 | pe.sha1 = %s) AS a""", 411 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 412 | estimated_clients_with_same_hash = cursor.fetchone()[0] 413 | 414 | cursor.execute(""" 415 | SELECT AVG(count) 416 | FROM 417 | (SELECT client, 418 | date_trunc('day', timestamp) AS ts, 419 | COUNT(*) 420 | FROM pe_dumps AS pe 421 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 422 | pe.corrupt='f' AND 423 | pe.sha1 = %s 424 | GROUP BY client, ts) AS b""", 425 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 426 | hash_daily_dump_rate_per_client = cursor.fetchone()[0] 427 | 428 | cursor.execute(""" 429 | 
UPDATE weka_features 430 | SET estimated_clients_with_same_hash = %s, 431 | hash_daily_dump_rate_per_client = %s 432 | WHERE dump_id = %s""", 433 | (estimated_clients_with_same_hash, hash_daily_dump_rate_per_client, 434 | dump_id)) 435 | 436 | 437 | def insert_download_request_features(cursor, dump_id): 438 | cursor.execute(""" 439 | SELECT * 440 | FROM pe_dumps 441 | WHERE dump_id = %s AND 442 | referer IS null""", 443 | (dump_id,)) 444 | if cursor.rowcount == 1: 445 | referer_exists = 0 446 | else: 447 | referer_exists = 1 448 | 449 | # update weka_features as wf set host_name_exists=0 from pe_dumps as pe 450 | # where pe.dump_id = wf.dump_id and host SIMILAR TO 451 | # '[0-9]+.[0-9]+.[0-9]+.[0-9]+' 452 | cursor.execute(""" 453 | SELECT * 454 | FROM pe_dumps 455 | WHERE dump_id = %s AND 456 | host = SUBSTRING(CAST(server AS TEXT) FROM '(.*)/32')""", 457 | (dump_id,)) 458 | if cursor.rowcount == 0: 459 | host_name_exists = 1 460 | else: 461 | host_name_exists = 0 462 | 463 | cursor.execute(""" 464 | UPDATE weka_features 465 | SET referer_exists = %s, 466 | host_name_exists = %s 467 | WHERE dump_id = %s""", 468 | (referer_exists, host_name_exists, dump_id)) 469 | 470 | # Once we generalize to file types beyond PE files, the extension_class feature should probably be removed 471 | common_ext = ['exe', 'dll', 'msi', 'jar', 'dmg', 'apk'] # executable files extensions... 472 | common_fake = ['html', 'gif', 'jpg', 'jpeg', 'txt', 'pdf', 'htm'] 473 | other_ext = ['php', 'aspx', 'asp'] 474 | 475 | cursor.execute(""" 476 | SELECT url 477 | FROM pe_dumps 478 | WHERE dump_id = %s""", 479 | (dump_id,)) 480 | url = cursor.fetchone()[0] 481 | if url is not None: 482 | ext = util.extract_extension(url) 483 | if ext is not None: 484 | ext = ext[:10] 485 | 486 | if ext is None: 487 | ext_class = "no_ext" 488 | elif ext in common_ext: 489 | ext_class = "common_ext" 490 | elif ext in common_fake: 491 | ext_class = "common_fake" 492 | elif ext in other_ext: 493 | ext_class = "other_ext" 494 | else: 495 | ext_class = "unknown_ext" 496 | #print "url:", url 497 | #print "extension:", ext 498 | else: 499 | ext_class = "no_url" 500 | ext = None 501 | cursor.execute(""" 502 | UPDATE weka_features 503 | SET extension_class = %s 504 | WHERE dump_id = %s""", 505 | (ext_class, dump_id)) 506 | 507 | cursor.execute(""" 508 | SELECT CHAR_LENGTH(url), url 509 | FROM pe_dumps 510 | WHERE dump_id = %s""", 511 | (dump_id,)) 512 | row = cursor.fetchone() 513 | url_length = None 514 | if row is not None: 515 | url_length = row[0] 516 | url = row[1] 517 | if url is not None: 518 | url_path = url.split('?')[0] 519 | directory_depth = url_path.count('/') 520 | else: 521 | url_length = 0 522 | directory_depth = 0 523 | 524 | cursor.execute(""" 525 | UPDATE weka_features SET 526 | url_length = %s, 527 | directory_depth = %s 528 | WHERE dump_id = %s""", 529 | (url_length, directory_depth, dump_id)) 530 | 531 | 532 | def insert_url_features(cursor, dump_id): 533 | # cursor.execute("SELECT ") 534 | cursor.execute("SELECT url from pe_dumps where dump_id = %s", (dump_id,)) 535 | url = cursor.fetchone()[0] 536 | cursor.execute(""" 537 | SELECT COUNT(DISTINCT dump_id) 538 | FROM pe_dumps AS pe JOIN 539 | ped_vts_mapping AS pvm USING (dump_id), 540 | virus_total_scans AS vts 541 | WHERE vts.trusted_av_labels > 1 AND 542 | pe.corrupt = 'f' AND 543 | pe.url = %s AND 544 | pe.dump_id < %s AND pe.dump_id > %s AND 545 | pvm.vt_id = vts.vt_id """, 546 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 547 | url_malware_downloads = cursor.fetchone()[0] 548 
| 549 | cursor.execute(""" 550 | SELECT COUNT(DISTINCT dump_id) 551 | FROM pe_dumps AS pe 552 | WHERE pe.url = %s AND 553 | pe.corrupt = 'f' AND 554 | pe.dump_id < %s AND pe.dump_id > %s """, 555 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 556 | url_total_downloads = cursor.fetchone()[0] 557 | 558 | cursor.execute(""" 559 | SELECT COUNT(DISTINCT pe.sha1) 560 | FROM pe_dumps AS pe 561 | WHERE pe.url = %s AND 562 | pe.corrupt = 'f' AND 563 | pe.dump_id < %s AND pe.dump_id > %s 564 | """, 565 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 566 | url_distinct_sha1s = cursor.fetchone()[0] 567 | 568 | cursor.execute(""" 569 | UPDATE weka_features 570 | SET url_malware_downloads = %s, 571 | url_total_downloads = %s, 572 | url_distinct_sha1s = %s 573 | WHERE dump_id = %s """, 574 | (url_malware_downloads, url_total_downloads, 575 | url_distinct_sha1s, dump_id)) 576 | 577 | 578 | def get_url_struct_matches(cursor, url_struct, dump_id): 579 | # escaping special regex characters 580 | replace = [ 581 | ('.', '\.'), ('+', '\+'), ('?', '\?'), 582 | ('{', '\{'), ('}', '\}'), ('[', '\['), 583 | (']', '\]'), ('^', '\^'), ('$', '\$') 584 | ] 585 | for pair in replace: 586 | url_struct = url_struct.replace(pair[0], pair[1]) 587 | # the structure should be matched against the whole query path 588 | url_struct = '^.*\?' + url_struct + '$' 589 | #print "The formatted url_struct: %s" % (url_struct,) 590 | cursor.execute(""" 591 | SELECT COUNT(DISTINCT dump_id) 592 | FROM pe_dumps AS pe JOIN 593 | ped_vts_mapping AS pvm USING (dump_id), 594 | virus_total_scans AS vts 595 | WHERE vts.trusted_av_labels > 1 AND 596 | pvm.vt_id = vts.vt_id AND 597 | pe.corrupt = 'f' AND 598 | pe.url ~ %s AND 599 | pe.dump_id < %s AND pe.dump_id > %s """, 600 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 601 | url_struct_malware_downloads = cursor.fetchone()[0] 602 | 603 | cursor.execute(""" 604 | SELECT COUNT(DISTINCT dump_id) 605 | FROM pe_dumps AS pe 606 | WHERE pe.url ~ %s AND 607 | pe.corrupt = 'f' AND 608 | pe.dump_id < %s AND pe.dump_id > %s """, 609 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 610 | url_struct_total_downloads = cursor.fetchone()[0] 611 | 612 | cursor.execute(""" 613 | SELECT COUNT(DISTINCT pe.sha1) 614 | FROM pe_dumps AS pe 615 | WHERE pe.url ~ %s AND 616 | pe.dump_id < %s AND pe.dump_id > %s AND 617 | pe.corrupt='f' """, 618 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 619 | url_struct_distinct_sha1s = cursor.fetchone()[0] 620 | return (url_struct_malware_downloads, url_struct_total_downloads, 621 | url_struct_distinct_sha1s) 622 | 623 | 624 | def insert_url_struct_matches(cursor, pmd, ptd, pds, dump_id): 625 | # use query parameters rather than string interpolation 626 | sql_query = "UPDATE weka_features " \ 627 | "SET url_struct_malware_downloads = %s, " \ 628 | "url_struct_total_downloads = %s, " \ 629 | "url_struct_distinct_sha1s = %s " \ 630 | "WHERE dump_id = %s" 631 | cursor.execute(sql_query, (pmd, ptd, pds, dump_id)) 632 | 633 | 
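# Worked example (illustrative, not in the original file): for a download URL
# such as "http://a.example.com/dl/get.php?id=123&k=abc",
# insert_url_struct_features() below reduces the query string "id=123&k=abc"
# to the generalized pattern "\w*=\w*&\w*=\w*", which get_url_struct_matches()
# above then anchors as "^.*\?\w*=\w*&\w*=\w*$" and matches (via the SQL "~"
# operator) against past download URLs sharing the same query-string structure.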
"The parsed query is:",query_list 653 | 654 | #print "Query is: %s" % query 655 | m = re.search('([^\w]*)([\w]+)([^\w]+)(.*)', query) 656 | if m is None: 657 | print "No url_struct found!" 658 | return 659 | first_exp = m.group(1) 660 | word = m.group(2) 661 | divide = m.group(3) 662 | rest = m.group(4) 663 | url_struct = None 664 | if first_exp is not None: 665 | url_struct = first_exp 666 | if rest is not None: 667 | url_struct += "\w*" + divide 668 | while True: 669 | m = re.search('([\w]+)([^\w]+)?(.*)', rest) 670 | if m is not None: 671 | word = m.group(1) 672 | divide = m.group(2) 673 | #if '.' in divide: 674 | #print "divide:", divide 675 | rest = m.group(3) 676 | if divide: 677 | url_struct += "\w*" + divide 678 | else: 679 | url_struct += "\w*" 680 | else: break 681 | 682 | #print "url_struct :", url_struct 683 | if len(url_struct) < 10: 684 | print "url_struct pattern length too short:%s, " % len(url_struct), url_struct 685 | return 686 | 687 | pmd, ptd, pds = get_url_struct_matches(cursor, url_struct, dump_id) 688 | print "Number of url_struct matching dumps: %s/%s" % (pmd,ptd) 689 | insert_url_struct_matches(cursor, pmd, ptd, pds, dump_id) 690 | 691 | 692 | # TODO: currently file_type is not used. 693 | # We will need to see if some of the features should be modified 694 | # to take the file_type into account 695 | def insert_features(cursor, dump_id, file_type): 696 | print "the dump_id is:", dump_id 697 | cursor.execute(""" 698 | DELETE FROM weka_features 699 | WHERE dump_id = %s 700 | """, (dump_id,)) 701 | cursor.execute(""" 702 | INSERT INTO weka_features (dump_id, corrupt, sha1, host) 703 | (SELECT pe.dump_id, pe.corrupt, pe.sha1, pe.host 704 | FROM pe_dumps AS pe 705 | WHERE pe.dump_id = %s )""", 706 | (dump_id,)) 707 | #print "Inserted dump_id", cursor.fetchone()[0] 708 | 709 | insert_hts_based_features(cursor, dump_id) 710 | 711 | insert_bgp_based_features(cursor, dump_id) 712 | insert_hash_based_features(cursor, dump_id) 713 | insert_download_request_features(cursor, dump_id) 714 | insert_url_features(cursor, dump_id) 715 | try: 716 | insert_url_struct_features(cursor, dump_id) 717 | except psycopg2.DataError as e: 718 | print "Exception in inserting url_struct features for %s dump_id" % (dump_id,) 719 | print e 720 | 721 | 722 | def get_feature_vector(dump_id, file_type): 723 | #print "entered get_feature_vector" 724 | conn = util.connect_to_db() 725 | cursor = conn.cursor() 726 | insert_features(cursor, dump_id, file_type) 727 | print "Done inserting features for dump_id: ", dump_id 728 | 729 | if __name__ == "__main__": 730 | if len(sys.argv) == 3: 731 | get_feature_vector(int(sys.argv[1]),sys.argv[2]) 732 | else: 733 | print "Incorrect number of arguments!!" 734 | --------------------------------------------------------------------------------