├── .gitignore ├── amico_scripts ├── logs │ └── README ├── .gitignore ├── manual_downloads │ └── README ├── weka.jar ├── models │ └── default.model ├── parsed │ ├── captured_files │ │ └── README │ └── raw_files │ │ └── README ├── stop_amico.sh ├── train_config.py ├── logging.conf ├── db_cleanup.py ├── postfile.py ├── etld.py ├── pe_extract.py ├── features.py ├── db_syslog.py ├── config.py.tmpl ├── update_urls_fix.py ├── vt_api.py ├── classify_dump.py ├── util.py ├── fe_db_setup.py ├── db_pe_dumps.py ├── manual_download.py ├── db_file_dumps.py ├── ip2asn.py ├── db_setup.py ├── db_virus_total.py ├── extract_file.py ├── start_amico.py ├── trainer.py ├── vt_submit.py └── get_feature_vector.py ├── file_dump ├── .gitignore ├── util │ ├── README │ ├── send_SIGTERM.sh │ ├── send_SIGUSR2.sh │ ├── send_SIGUSR1.sh │ ├── start.sh │ ├── set_cpu_affinity.sh │ ├── turn_offload_off.sh │ └── set_nic_irq_smp_affinity.sh ├── mac_strnlen.c ├── Makefile.valgrind ├── Makefile ├── start_file_dump.py ├── config.py.tmpl ├── README ├── seq_list.h ├── lru-cache.h ├── search.c ├── seq_list.c └── lru-cache.c ├── README.md └── license.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.log 3 | -------------------------------------------------------------------------------- /amico_scripts/logs/README: -------------------------------------------------------------------------------- 1 | Amico error and debug logs directory 2 | -------------------------------------------------------------------------------- /amico_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | parsed/ 3 | manual_downloads/ 4 | -------------------------------------------------------------------------------- /amico_scripts/manual_downloads/README: -------------------------------------------------------------------------------- 1 | Stores "manually" re-downloaded files 2 | -------------------------------------------------------------------------------- /file_dump/.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | file_dump 3 | *.o 4 | *.log 5 | *.log.* 6 | dumps/ 7 | -------------------------------------------------------------------------------- /amico_scripts/weka.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perdisci/amico/HEAD/amico_scripts/weka.jar -------------------------------------------------------------------------------- /amico_scripts/models/default.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/perdisci/amico/HEAD/amico_scripts/models/default.model -------------------------------------------------------------------------------- /amico_scripts/parsed/captured_files/README: -------------------------------------------------------------------------------- 1 | This directory contains all captured files (extracted from TCP flows) 2 | -------------------------------------------------------------------------------- /file_dump/util/README: -------------------------------------------------------------------------------- 1 | These scripts will need to be slightly edited to adapt them to your own system setup and needs 2 | -------------------------------------------------------------------------------- /amico_scripts/parsed/raw_files/README: -------------------------------------------------------------------------------- 
1 | This directory contains the raw TCP flow reconstruction containing the desired captured file 2 | -------------------------------------------------------------------------------- /amico_scripts/stop_amico.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps ux | grep start_amico.py | grep -v grep | awk '{print $2}'); do 4 | kill $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGTERM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps aux | grep file_dump | grep -v python | grep -v sudo | grep -v postgres | grep -v grep | awk '{print $2}'); do 4 | sudo kill -SIGTERM $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGUSR2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(ps aux | grep file_dump | grep -v python | grep -v sudo | grep -v postgres | grep -v grep | awk '{print $2}'); do 4 | sudo kill -SIGUSR2 $i; 5 | done 6 | -------------------------------------------------------------------------------- /file_dump/util/send_SIGUSR1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in $(pgrep file_dump); do 4 | sudo kill -SIGUSR1 $i; 5 | done 6 | 7 | for i in $(ls zc98_*.log); do 8 | tail $i | egrep "(dropped|received)"; 9 | done 10 | -------------------------------------------------------------------------------- /file_dump/util/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # launch as ./util/start.sh 4 | 5 | for i in {0..8}; do 6 | python start_file_dump.py "zc:99@$i" >& zc99_$i.log & 7 | done 8 | 9 | sleep 1 10 | 11 | ./util/set_cpu_affinity.sh 2 12 | -------------------------------------------------------------------------------- /file_dump/mac_strnlen.c: -------------------------------------------------------------------------------- 1 | #ifdef __APPLE__ 2 | 3 | #include <string.h> 4 | 5 | size_t strnlen(const char *s, size_t n) { 6 | int i; 7 | 8 | for(i=0; i<n && s[i] != '\0'; i++); 9 | return i; 10 | } 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /file_dump/util/set_nic_irq_smp_affinity.sh: -------------------------------------------------------------------------------- 8 | sudo echo 00000100 > /proc/irq/119/smp_affinity 9 | sudo echo 00000200 > /proc/irq/121/smp_affinity 10 | sudo echo 00000400 > /proc/irq/123/smp_affinity 11 | sudo echo 00000800 > /proc/irq/125/smp_affinity 12 | sudo echo 00001000 > /proc/irq/128/smp_affinity 13 | sudo echo 00002000 > /proc/irq/130/smp_affinity 14 | -------------------------------------------------------------------------------- /amico_scripts/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, amico_logger 3 | 4 | [handlers] 5 | keys=fileDebugHandler,fileErrorHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=ERROR 12 | handlers= 13 | 14 | [logger_amico_logger] 15 | level=DEBUG 16 | handlers=fileDebugHandler,fileErrorHandler 17 | qualname=amico_logger 18 | propagate=0 19 | 20 | [handler_fileErrorHandler] 21 | class=logging.handlers.RotatingFileHandler 22 | level=WARNING 23 | formatter=simpleFormatter 24 | args=("logs/amico_error.log", 'a', (5*1024*1024), 5) 25 | 26 | 27 | [handler_fileDebugHandler] 28 | class=logging.handlers.RotatingFileHandler 29 | level=DEBUG 30 | formatter=simpleFormatter 31 | args=("logs/amico_debug.log", 'a', (5*1024*1024), 5) 32 | 33 | [formatter_simpleFormatter] 34 | format=%(asctime)s - %(levelname)s - %(message)s 35 | --------------------------------------------------------------------------------
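Scripts can attach to this logging configuration via the standard library's fileConfig loader. A minimal usage sketch (assuming the script runs from the amico_scripts directory so the relative logs/ paths above resolve; the logger name matches qualname=amico_logger in the config):

import logging
import logging.config

logging.config.fileConfig("logging.conf")    # loads the loggers/handlers defined above
logger = logging.getLogger("amico_logger")   # qualname from the config
logger.debug("written to logs/amico_debug.log")
logger.warning("written to both logs/amico_debug.log and logs/amico_error.log")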
/file_dump/start_file_dump.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from config import whitelist_subnets, manual_download_ip 4 | 5 | 6 | def print_usage(): 7 | print "Usage: sudo python start_file_dump.py ethX" 8 | sys.exit() 9 | 10 | if len(sys.argv) < 2: 11 | print_usage() 12 | nic = sys.argv[1] 13 | 14 | bpf_filter = "\"tcp" 15 | 16 | if len(whitelist_subnets) > 0: 17 | bpf_filter += " and not (" 18 | for subnet in whitelist_subnets: 19 | bpf_filter += "net %s or " % (subnet,) 20 | bpf_filter = bpf_filter[:-4] 21 | bpf_filter += ")" 22 | 23 | if len(manual_download_ip) > 0: 24 | bpf_filter += " and not net %s" % (manual_download_ip,) 25 | 26 | bpf_filter += "\"" 27 | 28 | subprocess.call(""" 29 | ./file_dump -i %s -d dumps/ -A -J -G -f %s """ % 30 | (nic, bpf_filter), shell=True) 31 | -------------------------------------------------------------------------------- /file_dump/config.py.tmpl: -------------------------------------------------------------------------------- 1 | # The IP from which the manual downloads are happening should be listed here to 2 | # prevent infinite download loops. 3 | manual_download_ip = "" 4 | 5 | # The subnets hosting popular benign websites like Facebook, Google, MSN are 6 | # listed here and are fed to a BPF filter. This reduces the traffic load on 7 | # file_dump. 8 | whitelist_subnets = [ 9 | "69.171.224.0/20", 10 | "66.220.152.0/21", 11 | "74.125.0.0/16", 12 | "220.181.111.0/24", 13 | "123.125.114.0/24", 14 | "199.59.148.0/22", 15 | "65.54.94.0/23", 16 | "65.55.160.0/19", 17 | "65.55.192.0/18", 18 | "66.135.192.0/19", 19 | "157.166.224.0/20", 20 | "15.192.0.0/16", 21 | "143.166.0.0/17", 22 | "17.148.0.0/14", 23 | "192.150.16.0/23"] 24 | --------------------------------------------------------------------------------
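The string built in start_file_dump.py becomes the -f argument passed to file_dump; the [:-4] slice trims the final " or ". A standalone sketch of the same construction and its resulting filter (the two subnets and the manual-download IP below are illustrative values only, not recommendations):

whitelist_subnets = ["69.171.224.0/20", "66.220.152.0/21"]
manual_download_ip = "10.0.0.5"   # hypothetical; set the real one in config.py

bpf_filter = "tcp"
if len(whitelist_subnets) > 0:
    # " or ".join() is equivalent to appending "net X or " and trimming 4 chars
    bpf_filter += " and not (" + " or ".join("net %s" % s for s in whitelist_subnets) + ")"
if len(manual_download_ip) > 0:
    bpf_filter += " and not net %s" % (manual_download_ip,)
print bpf_filter
# tcp and not (net 69.171.224.0/20 or net 66.220.152.0/21) and not net 10.0.0.5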
/amico_scripts/db_cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Phani Vadrevu # 5 | # pvadrevu@uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import psycopg2 18 | import config 19 | from config import * 20 | 21 | # Connect to database 22 | try: 23 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 24 | %(db_name,db_host,db_user,db_password)) 25 | except: 26 | print "Unable to connect to database: "+db_name 27 | 28 | # Use Autocommit mode for database connection 29 | conn.set_isolation_level(0) 30 | cursor = conn.cursor() 31 | 32 | cursor.execute("""DROP TABLE IF EXISTS pe_dumps,virus_total_scans, 33 | ped_vts_mapping, manual_download_checksums,bgp2asn, 34 | weka_features, virus_total_submissions, amico_scores CASCADE""") 35 | print """Dropped the tables: pe_dumps, virus_total_scans, ped_vts_mapping, 36 | manual_download_checksums, bgp2asn, weka_features, 37 | virus_total_submissions, amico_scores""" 38 | cursor.close() 39 | conn.close() 40 | --------------------------------------------------------------------------------
/amico_scripts/postfile.py: -------------------------------------------------------------------------------- 1 | import httplib, mimetypes 2 | 3 | 4 | def post_multipart(host, selector, fields, files): 5 | """ 6 | Post fields and files to an http host as multipart/form-data. 7 | fields is a sequence of (name, value) elements for regular form fields. 8 | files is a sequence of (name, filename, value) elements for data to be uploaded as files. 9 | Return the server's response page. 10 | """ 11 | content_type, body = encode_multipart_formdata(fields, files) 12 | h = httplib.HTTPSConnection(host, timeout=120) 13 | h.putrequest('POST', selector) 14 | h.putheader('content-type', content_type) 15 | h.putheader('content-length', str(len(body))) 16 | h.endheaders() 17 | h.send(body) 18 | return h.getresponse().read() 19 | 20 | 21 | def encode_multipart_formdata(fields, files): 22 | """ 23 | fields is a sequence of (name, value) elements for regular form fields. 24 | files is a sequence of (name, filename, value) elements for data to be uploaded as files. 25 | Return (content_type, body) ready for an httplib.HTTP instance. 26 | """ 27 | BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' 28 | CRLF = '\r\n' 29 | L = [] 30 | for (key, value) in fields: 31 | L.append('--' + BOUNDARY) 32 | L.append('Content-Disposition: form-data; name="%s"' % key) 33 | L.append('') 34 | L.append(value) 35 | for (key, filename, value) in files: 36 | L.append('--' + BOUNDARY) 37 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) 38 | L.append('Content-Type: %s' % get_content_type(filename)) 39 | L.append('') 40 | L.append(value) 41 | L.append('--' + BOUNDARY + '--') 42 | L.append('') 43 | body = CRLF.join(L) 44 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 45 | return content_type, body 46 | 47 | 48 | def get_content_type(filename): 49 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 50 | --------------------------------------------------------------------------------
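A hypothetical call to post_multipart(), mirroring how vt_api.py submits samples to the VirusTotal v2 scan endpoint used elsewhere in this repo (the API key and file name below are placeholders):

from postfile import post_multipart

fields = [("apikey", "YOUR_VT_API_KEY")]                    # placeholder key
file_data = open("sample.exe", "rb").read()                 # hypothetical local file
files = [("file", "sample.exe", file_data)]
response = post_multipart("www.virustotal.com",
                          "https://www.virustotal.com/vtapi/v2/file/scan",
                          fields, files)
print response   # JSON report returned by the server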
/file_dump/README: -------------------------------------------------------------------------------- 1 | 2 | Usage: ./file_dump [-i NIC] [-r pcap_file] [-A] -d dump_dir [-f "pcap_filter"] [-L lru_cache_size] [-K max_dump_file_size (KB)] [-D debug_level] 3 | 4 | -i : Use to specify network interface (e.g., -i eth0) 5 | -r : Read from .pcap file instead of NIC (e.g., -r file.pcap) 6 | -A : If specified, this flag will turn off the on-the-fly srcIP anonymization 7 | -d : Directory where raw HTTP responses containing reconstructed files are stored (e.g., -d ./dumps) 8 | -f : Specify BPF filter (e.g., -f "tcp port 80") 9 | -L : Change LRU cache size (default = 10000 entries) 10 | -K : Change max accepted reconstructed file size, in KB (e.g., -K 1024) 11 | -D : Specify debug_level (value from 0-4) 12 | -J : extract JAR/APK files 13 | -E : extract ELF files 14 | -G : extract DMG files 15 | -Z : extract ZIP files 16 | -R : extract RAR files 17 | -P : extract PDF files 18 | -M : extract MS DOC files 19 | 20 | ==== 21 | 22 | NOTE: the IPs below have not been updated in a while... 23 | 24 | Suggested BPF filter to reduce load on file_dump packet analysis: 25 | 26 | BPF FILTER = tcp and not (net 69.171.224.0/20 or net 66.220.152.0/21 or net 74.125.0.0/16 or net 220.181.111.0/24 or net 123.125.114.0/24 or net 199.59.148.0/22 or net 65.54.94.0/23 or net 65.55.160.0/19 or net 65.55.192.0/18 or net 66.135.192.0/19 or net 157.166.224.0/20 or net 15.192.0.0/16 or net 143.166.0.0/17 or net 17.148.0.0/14 or net 192.150.16.0/23) 27 | 28 | 29 | Facebook 30 | 69.171.224.0/20 31 | 66.220.152.0/21 32 | 33 | Google/Youtube 34 | 74.125.0.0/16 35 | 36 | Baidu 37 | 220.181.111.0/24 38 | 123.125.114.0/24 39 | 40 | Twitter 41 | 199.59.148.0/22 42 | 43 | Microsoft/MSN/Live.com/Bing 44 | 65.54.94.0/23 45 | 65.55.160.0/19 46 | 65.55.192.0/18 47 | 48 | Ebay 49 | 66.135.192.0/19 50 | 51 | CNN 52 | 157.166.224.0/20 53 | 54 | HP 55 | 15.192.0.0/16 56 | 57 | DELL 58 | 143.166.0.0/17 59 | 60 | APPLE 61 | 17.148.0.0/14 62 | 63 | ADOBE 64 | 192.150.16.0/23 65 | 66 | --------------------------------------------------------------------------------
/file_dump/seq_list.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Roberto Perdisci (perdisci@cs.uga.edu) 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | 18 | #include <stdio.h> 19 | #include <stdlib.h> 20 | #include <string.h> 21 | 22 | typedef unsigned int u_int; 23 | 24 | 25 | ////////////////////////////////////////////////////////////// 26 | // This is an implementation of a simple list that holds pairs: 27 | // (sequence_number, payload_size) 28 | ////////////////////////////////////////////////////////////// 29 | 30 | typedef struct seq_list_entry { 31 | 32 | u_int sn; // sequence number 33 | u_int ps; // payload size 34 | struct seq_list_entry *next; 35 | 36 | } seq_list_entry_t; 37 | 38 | typedef struct seq_list { 39 | 40 | seq_list_entry_t *head; 41 | seq_list_entry_t *tail; 42 | seq_list_entry_t *next; 43 | 44 | } seq_list_t; 45 | 46 | seq_list_t* seq_list_init(void); 47 | void seq_list_destroy(seq_list_t* l, int mz_found); 48 | void seq_list_insert(seq_list_t *l, u_int i, u_int j); 49 | seq_list_entry_t *seq_list_head(seq_list_t *l); 50 | seq_list_entry_t *seq_list_tail(seq_list_t *l); 51 | seq_list_entry_t *seq_list_next(seq_list_t *l); 52 | void seq_list_restart_from_head(seq_list_t *l); 53 | void seq_list_restart_from_element(seq_list_t *l, seq_list_entry_t *e); 54 | u_int seq_list_get_seq_num(seq_list_entry_t *e); 55 | u_int seq_list_get_payload_size(seq_list_entry_t *e); 56 | void seq_list_print(seq_list_t *l); 57 | 58 | --------------------------------------------------------------------------------
/amico_scripts/etld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (c) 2009 Michael Still 4 | # Released under the terms of the GNU GPL v2 5 | 6 | # Mozilla publishes a rule file which may be used to calculate effective TLDs 7 | # at: 8 | # 9 | # http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/src/ 10 | # effective_tld_names.dat?raw=1 11 | # 12 | # Use that file to take a domain name and return a (domain, etld) tuple. 13 | # Documentation for the rule file format is at: 14 | # 15 | # https://wiki.mozilla.org/Gecko:Effective_TLD_Service 16 | 17 | import re 18 | import sys 19 | import time 20 | 21 | class etld(object): 22 | """Helper to determine the effective TLD portion of a domain name.""" 23 | 24 | def __init__(self, datafile='effective_tld_names.dat'): 25 | """Load the data file ready for lookups.""" 26 | 27 | self.rules = {} 28 | 29 | file = open(datafile) 30 | line = file.readline() 31 | while line: 32 | line = line.rstrip() 33 | if line and not line.startswith('//'): 34 | tld = line.split('.')[-1] 35 | self.rules.setdefault(tld, []) 36 | self.rules[tld].append(re.compile(self.regexpize(line))) 37 | 38 | line = file.readline() 39 | file.close() 40 | 41 | def regexpize(self, line): 42 | """Convert a rule to regexp syntax.""" 43 | 44 | line = line[::-1].replace('.', '\\.').replace('*', '[^\\.]*').replace('!', '') 45 | return '^(%s)\.(.*)$' % line 46 | 47 | def parse(self, hostname): 48 | """Parse a hostname into domain and etld portions.""" 49 | 50 | hostname = hostname.lower() 51 | tld = hostname.split('.')[-1] 52 | hostname = hostname[::-1] 53 | domain = '' 54 | etld = '' 55 | 56 | for rule in self.rules[tld]: 57 | m = rule.match(hostname) 58 | if m and m.group(1) > etld: 59 | domain = m.group(2)[::-1] 60 | etld = m.group(1)[::-1] 61 | 62 | if not etld: 63 | raise Exception('Parse failed') 64 | 65 | return (domain, etld) 66 | 67 | 68 | if __name__ == '__main__': 69 | e = etld() 70 | 71 | f = open(sys.argv[1]) 72 | l = f.readline() 73 | start_time = time.time() 74 | 75 | while l: 76 | try: 77 | l = l.rstrip() 78 | print '%s -> %s' %(l, e.parse(l)) 79 | except Exception, ex: 80 | print ex 81 | 82 | l = f.readline() 83 | 84 | print 'Took %f seconds' % (time.time() - start_time) 85 | f.close() 86 | --------------------------------------------------------------------------------
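To make the reversed-regex trick in regexpize()/parse() concrete, here is a worked example using the sample rule "*.uk" (an illustrative rule; it is not necessarily present verbatim in the current data file):

import re

rule = '*.uk'
regexp = '^(%s)\.(.*)$' % rule[::-1].replace('.', '\\.').replace('*', '[^\\.]*').replace('!', '')
print regexp                          # ^(ku\.[^\.]*)\.(.*)$

m = re.match(regexp, 'www.example.co.uk'[::-1])   # match against the reversed hostname
print m.group(1)[::-1]                # co.uk        (the effective TLD)
print m.group(2)[::-1]                # www.example  (the remaining domain part)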
/amico_scripts/pe_extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Roberto Perdisci # 5 | # perdisci@cs.uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import sys, os 18 | import re 19 | from struct import unpack 20 | 21 | def prune_http_resp_headers(data): 22 | # this makes sure we find the actual start of the PE and not a random match 23 | m = re.search('\s\sMZ', data) 24 | if m: 25 | pos = m.start() 26 | data = data[pos:] 27 | 28 | # now we can start copying data from MZ to the end 29 | m = re.search('MZ',data) 30 | if m: 31 | pos = m.start() 32 | return data[pos:] 33 | 34 | def is_pe_file(bin_data): 35 | 36 | if not bin_data: 37 | return False 38 | 39 | if len(bin_data) <= 0: 40 | return False 41 | 42 | m = re.search('MZ', bin_data) 43 | if m: 44 | p = m.start() 45 | offset = p + unpack('i', bin_data[p+0x3c:p+0x3c+4])[0] 46 | # print "p=", p, " offset=", offset 47 | if bin_data[p:p+2] == 'MZ' and bin_data[offset:offset+2] == 'PE': 48 | # print "This is a PE file!" 49 | return True 50 | 51 | print "This is NOT a PE file!" 52 | return False 53 | 54 | 55 | def usage(): 56 | print >> sys.stderr, 'usage: %s [-i device] [-r file] [pcap filter]' % sys.argv[0] 57 | sys.exit(1) 58 | 59 | 60 | def pe_extract(flow_file, dst=None): 61 | if not dst: 62 | dst = flow_file + '.exe' 63 | f = open(flow_file, 'rb') 64 | data = f.read() 65 | f.close() 66 | 67 | data = prune_http_resp_headers(data) 68 | 69 | if is_pe_file(data): 70 | print "Writing file:", flow_file+'.exe' 71 | f = open(dst, 'wb') 72 | f.write(data) 73 | f.close() 74 | return True 75 | 76 | print "Finished!" 77 | return False 78 | 79 | 80 | if __name__ == '__main__': 81 | pe_extract(sys.argv[1]) 82 | -------------------------------------------------------------------------------- /file_dump/lru-cache.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This is an implementation of a O(1) LRU cache. 3 | * Copyright (C) 2010 Roberto Perdisci (perdisci@cs.uga.edu) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include <time.h> 22 | 23 | #define MAX_LRUC_TTL 5*60 // 5 minutes 24 | 25 | typedef unsigned int u_int; 26 | 27 | typedef struct ht_entry { 28 | 29 | const char *key; 30 | struct lruc_entry *le; 31 | struct ht_entry *next; 32 | 33 | } ht_entry_t; 34 | 35 | typedef struct hash_table { 36 | 37 | u_int length; 38 | ht_entry_t **vect; 39 | 40 | } hash_table_t; 41 | 42 | typedef struct lruc_entry { 43 | 44 | char *key; 45 | void *value; 46 | time_t time; 47 | struct lruc_entry *prev; 48 | struct lruc_entry *next; 49 | 50 | } lruc_entry_t; 51 | 52 | typedef struct lru_cache { 53 | 54 | hash_table_t *ht; // pointer to the Hash Table for O(1) searches 55 | lruc_entry_t *top; // pointer to the top of the LRU cache 56 | void (*destroy_val_fn)(void*); // callback function for destroying an entry value 57 | 58 | u_int num_entries; 59 | u_int max_entries; 60 | 61 | } lru_cache_t; 62 | 63 | hash_table_t* ht_init(u_int length); 64 | lru_cache_t* lruc_init_str(u_int max_entries); 65 | lru_cache_t* lruc_init(u_int max_entries, void (*destroy_val_fn)(void*)); 66 | 67 | void ht_insert(hash_table_t *ht, lruc_entry_t *lruc_e, const char *key); 68 | void ht_delete(hash_table_t *ht, const char *key); 69 | void ht_destroy(hash_table_t* ht); 70 | int lruc_insert_str(lru_cache_t *lruc, const char *key, const char* value); 71 | int lruc_insert(lru_cache_t *lruc, const char *key, void* value); 72 | void lruc_delete(lru_cache_t *lruc, const char *key); 73 | void lruc_destroy(lru_cache_t *lruc); 74 | 75 | lruc_entry_t* ht_search(const hash_table_t *ht, const char *key); 76 | char* lruc_search_str(lru_cache_t *lruc, const char *key); 77 | void* lruc_search(lru_cache_t *lruc, const char *key); 78 | 79 | u_int hash_fn(const char* key); 80 | u_int DJBHash(const char* str, u_int len); 81 | 82 | void print_ht(hash_table_t *ht); 83 | void
print_lruc(lru_cache_t *lruc); 84 | void clean_lruc(lru_cache_t *lruc); 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /amico_scripts/features.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2013 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | features = ( 15 | ["dump_id", 16 | "vt_month_shelf", 17 | "corrupt", 18 | "host_malware_downloads", 19 | "host_suspicious_downloads", 20 | "host_benign_downloads", 21 | "host_total_downloads", 22 | "host_malware_ratio", 23 | "host_suspicious_ratio", 24 | "host_benign_ratio", 25 | "host_avg_av_labels", 26 | "host_avg_trusted_labels", 27 | "host_unknown_hashes", 28 | "host_total_hashes", 29 | "host_unknown_hash_ratio", 30 | "twold_malware_downloads", 31 | "twold_suspicious_downloads", 32 | "twold_benign_downloads", 33 | "twold_total_downloads", 34 | "twold_malware_ratio", 35 | "twold_suspicious_ratio", 36 | "twold_benign_ratio", 37 | "twold_avg_av_labels", 38 | "twold_avg_trusted_labels", 39 | "twold_unknown_hashes", 40 | "twold_total_hashes", 41 | "twold_unknown_hash_ratio", 42 | "server_ip_malware_downloads", 43 | "server_ip_suspicious_downloads", 44 | "server_ip_benign_downloads", 45 | "server_ip_total_downloads", 46 | "server_ip_malware_ratio", 47 | "server_ip_suspicious_ratio", 48 | "server_ip_benign_ratio", 49 | "server_ip_avg_av_labels", 50 | "server_ip_avg_trusted_labels", 51 | "server_ip_unknown_hashes", 52 | "server_ip_total_hashes", 53 | "server_ip_unknown_hash_ratio", 54 | "bgp_malware_downloads", 55 | "bgp_suspicious_downloads", 56 | "bgp_benign_downloads", 57 | "bgp_total_downloads", 58 | "bgp_malware_ratio", 59 | "bgp_suspicious_ratio", 60 | "bgp_benign_ratio", 61 | "bgp_avg_av_labels", 62 | "bgp_avg_trusted_labels", 63 | "bgp_unknown_hashes", 64 | "bgp_total_hashes", 65 | "bgp_unknown_hash_ratio", 66 | "hash_life_time", 67 | "num_dumps_with_same_hash", 68 | "hash_daily_dump_rate_per_client", 69 | "estimated_clients_with_same_hash", 70 | "referer_exists", 71 | "host_name_exists", 72 | "extension_class", 73 | "url_length", 74 | "directory_depth", 75 | "sha1", 76 | "host", 77 | "url_malware_downloads", 78 | "url_total_downloads", 79 | "url_distinct_sha1s", 80 | "url_struct", 81 | "url_struct_malware_downloads", 82 | "url_struct_total_downloads", 83 | "url_struct_distinct_sha1s"]) 84 | -------------------------------------------------------------------------------- /amico_scripts/db_syslog.py: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # copyright (c) 2011 phani vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free 
Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from config import amico_threshold 17 | import syslog 18 | import time 19 | 20 | import util 21 | from util import reorder_domain 22 | 23 | # Wait for db_virus_total to complete 24 | WAIT_TIME = 60 25 | 26 | 27 | def make_syslog_entry(cursor, dump_id, score): 28 | # Database query to get the relevant record 29 | cursor.execute(""" 30 | SELECT timestamp, client, server, dst_port, host, url, referer, 31 | pe.sha1, pe.md5, file_size, num_av_labels, corrupt, file_type 32 | FROM pe_dumps as pe LEFT JOIN virus_total_scans as vts USING(sha1) 33 | WHERE (corrupt = 'false' OR num_av_labels IS NOT NULL) AND 34 | dump_id = %s 35 | ORDER BY vts.query_time DESC 36 | """ % (dump_id,)) 37 | if cursor.rowcount == 0: 38 | return 39 | log_data = list(cursor.fetchone()) 40 | log_data[4] = reorder_domain(log_data[4]) 41 | 42 | # if a score != None is passed as an argument, use it to label the report (otherwise the report field stays "-") 43 | report = "-" 44 | 45 | if score is not None: 46 | score = float(score) # just to make sure we are dealing with real numbers and not a string ... 47 | if score > amico_threshold: 48 | report = "MALWARE" 49 | else: 50 | report = "BENIGN" 51 | report += "#%s#%s" % (score, amico_threshold) 52 | 53 | log_data.append(report) 54 | 55 | if log_data: 56 | #print log_data 57 | entry = ("file download -- timestamp: %s, client_ip: %s, server_ip:" 58 | " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:" 59 | " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s" % 60 | tuple(log_data)) 61 | # syslog.syslog(syslog.LOG_ALERT,q) 62 | syslog.syslog(syslog.LOG_WARNING | syslog.LOG_USER, entry) 63 | 64 | 65 | def db_syslog(dump_id,score): 66 | time.sleep(WAIT_TIME) 67 | conn = util.connect_to_db() 68 | cursor = conn.cursor() 69 | make_syslog_entry(cursor, dump_id, score) 70 | cursor.close() 71 | conn.close() 72 | 73 | 74 | if __name__ == "__main__": 75 | dump_id = sys.argv[1] 76 | score = float(sys.argv[2]) 77 | db_syslog(dump_id,score) 78 | --------------------------------------------------------------------------------
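The entry format above fully determines the syslog line. A sketch that renders one entry with invented field values (every value below is hypothetical, chosen only to show the layout; the report segment corresponds to a score of 0.82 against the default threshold of 0.4):

log_data = ['2014-10-01 12:34:56', '10.0.0.42', '203.0.113.7', '80',
            'com.example.www', '/download/setup.exe', 'http://www.example.com/',
            'da39a3ee5e6b4b0d3255bfef95601890afd80709',
            'd41d8cd98f00b204e9800998ecf8427e', '123456', '3', 'False', 'EXE']
log_data.append("MALWARE#0.82#0.4")   # report string: label#score#threshold
print ("file download -- timestamp: %s, client_ip: %s, server_ip:"
       " %s, server_port: %s, host: %s, url: %s, referrer: %s, sha1: %s, md5:"
       " %s, file_size: %s, av_labels: %s, corrupt: %s, file_type: %s, amico_score: %s" %
       tuple(log_data))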
/amico_scripts/config.py.tmpl: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | # Postgres DB Info 16 | db_host = "localhost" 17 | db_name = "" 18 | db_user = "" 19 | db_password = "" 20 | 21 | # VirusTotal API Keys, as a list of Python Strings eg: ["abcd", "efgh"] 22 | # Get your VT API key at: https://www.virustotal.com/en/ 23 | vt_keys = [] 24 | 25 | # "live", "manual" or None 26 | # If vt_submissions = "manual", fill the manual_download_ip variable in 27 | # file_dump/config.py 28 | vt_submissions = "live" 29 | vt_submissions_ext = ['exe','apk','dmg','jar'] 30 | 31 | trusted_av_vendors = ["Avast", "AVG", "F-Secure", "Kaspersky", "McAfee", 32 | "Microsoft", "Sophos", "Symantec", "TrendMicro"] 33 | 34 | 35 | # file types that should be captured and stored on disk 36 | # see extract_file.py for available types 37 | capture_file_types = ["EXE"] 38 | 39 | 40 | # Manual downloads are only enabled if vt_submissions = "manual". 41 | # Every time a download is detected, a new HTTP request is issued to the URL 42 | # from which the executable was downloaded, and the file is re-fetched from 43 | # the same webserver. We refer to this as a manual download. Please specify the 44 | # directory where these manual downloads should be saved 45 | MAN_DOWNLOAD_DIR = "manual_downloads" 46 | LIVE_DOWNLOAD_DIR = "parsed/captured_files" 47 | 48 | # parameters for feature computation based on past activity 49 | MAX_PAST_DUMPS = 100000 # only go back up to the past MAX_PAST_DUMPS dumps 50 | MAX_PAST_DAYS = 30 # go back in time MAX_PAST_DAYS days max 51 | 52 | # When running AMICO under a SOCKS proxy, use these. 53 | # Else, have socks_proxy_host as "None" 54 | #socks_proxy_host = "localhost" 55 | socks_proxy_host = None 56 | socks_proxy_port = 12345 57 | 58 | 59 | # The threshold value for classification between (0,1) 60 | # used in db_syslog.py script 61 | amico_threshold = 0.4 62 | 63 | # The name of the training model file to be used for 64 | # classification.
Use the trainer.py script to create 65 | # a new model specific to your network 66 | model_file = "models/default.model" 67 | 68 | whitelist_domains = [ 69 | "windowsupdate.com", 70 | "avg.com", 71 | "microsoft.com", 72 | "adobe.com", 73 | "apple.com", 74 | "google.com", 75 | "se.360.cn"] 76 | -------------------------------------------------------------------------------- /file_dump/search.c: -------------------------------------------------------------------------------- 1 | /* 2 | * NOTE ON LICENSING FOR THIS FILE 3 | * The following code was borrowed from 4 | * Wikipedia.org 5 | * For example it can be found here: 6 | * http://en.wikipedia.org/w/index.php?title=Boyer%E2%80%93Moore_string_search_algorithm&oldid=399934077 7 | */ 8 | 9 | # include <limits.h> 10 | # include <string.h> 11 | 12 | # define ALPHABET_SIZE (1 << CHAR_BIT) 13 | 14 | static void compute_prefix(const char* str, size_t size, int result[size]) { 15 | size_t q; 16 | int k; 17 | result[0] = 0; 18 | 19 | k = 0; 20 | for (q = 1; q < size; q++) { 21 | while (k > 0 && str[k] != str[q]) 22 | k = result[k-1]; 23 | 24 | if (str[k] == str[q]) 25 | k++; 26 | result[q] = k; 27 | } 28 | } 29 | 30 | static void prepare_badcharacter_heuristic(const char *str, size_t size, 31 | int result[ALPHABET_SIZE]) { 32 | 33 | size_t i; 34 | 35 | for (i = 0; i < ALPHABET_SIZE; i++) 36 | result[i] = -1; 37 | 38 | for (i = 0; i < size; i++) 39 | result[(size_t) str[i]] = i; 40 | } 41 | 42 | void prepare_goodsuffix_heuristic(const char *normal, size_t size, 43 | int result[size + 1]) { 44 | 45 | char *left = (char *) normal; 46 | char *right = left + size; 47 | char reversed[size+1]; 48 | char *tmp = reversed + size; 49 | size_t i; 50 | 51 | /* reverse string */ 52 | *tmp = 0; 53 | while (left < right) 54 | *(--tmp) = *(left++); 55 | 56 | int prefix_normal[size]; 57 | int prefix_reversed[size]; 58 | 59 | compute_prefix(normal, size, prefix_normal); 60 | compute_prefix(reversed, size, prefix_reversed); 61 | 62 | for (i = 0; i <= size; i++) { 63 | result[i] = size - prefix_normal[size-1]; 64 | } 65 | 66 | for (i = 0; i < size; i++) { 67 | const int j = size - prefix_reversed[i]; 68 | const int k = i - prefix_reversed[i]+1; 69 | 70 | if (result[j] > k) 71 | result[j] = k; 72 | } 73 | } 74 | /* 75 | * Boyer-Moore search algorithm 76 | */ 77 | const char *boyermoore_search(const char *haystack, const char *needle) { 78 | /* 79 | * Calc string sizes 80 | */ 81 | size_t needle_len, haystack_len; 82 | needle_len = strlen(needle); 83 | haystack_len = strlen(haystack); 84 | 85 | /* 86 | * Simple checks 87 | */ 88 | if(haystack_len == 0) 89 | return NULL; 90 | if(needle_len == 0) 91 | return haystack; 92 | 93 | /* 94 | * Initialize heuristics 95 | */ 96 | int badcharacter[ALPHABET_SIZE]; 97 | int goodsuffix[needle_len+1]; 98 | 99 | prepare_badcharacter_heuristic(needle, needle_len, badcharacter); 100 | prepare_goodsuffix_heuristic(needle, needle_len, goodsuffix); 101 | 102 | /* 103 | * Boyer-Moore search 104 | */ 105 | size_t s = 0; 106 | while(s <= (haystack_len - needle_len)) 107 | { 108 | size_t j = needle_len; 109 | while(j > 0 && needle[j-1] == haystack[s+j-1]) 110 | j--; 111 | 112 | if(j > 0) 113 | { 114 | int k = badcharacter[(size_t) haystack[s+j-1]]; 115 | int m; 116 | if(k < (int)j && (m = j-k-1) > goodsuffix[j]) 117 | s+= m; 118 | else 119 | s+= goodsuffix[j]; 120 | } 121 | else 122 | { 123 | return haystack + s; 124 | } 125 | } 126 | 127 | /* not found */ 128 | return NULL; 129 | } 130 | --------------------------------------------------------------------------------
/amico_scripts/update_urls_fix.py: -------------------------------------------------------------------------------- 1 | # Author: Phani Vadrevu 2 | # 3 | # This script fixes a bug related to empty URLs in Amico's DB 4 | # It reparses raw file dumps to fill missing URLs 5 | # It should only be used to correct missing URLs produced 6 | # by the version of Amico's code before "dev" branch commit 7 | # b1d39fcf158441af61a59a571b342e9826a46c9d 8 | 9 | import logging 10 | import re 11 | import os 12 | 13 | import util 14 | 15 | RAW_FILE_DIR = "/home/perdisci/amico/amico_scripts/parsed/raw_files/" 16 | LOG_FILE = "/home/perdisci/amico/amico_scripts/parsed/update_urls_amico.log" 17 | 18 | def update_url(file_path,conn): 19 | #print "Time b4 http parsing: %f" %(time.time(),) 20 | # Use Autocommit mode for database connection 21 | 22 | fileHandle = open(file_path) 23 | 24 | # Timestamp 25 | r = re.compile('[0-9]+') 26 | timestamp = r.search(fileHandle.readline()) 27 | if timestamp is not None: 28 | timestamp = timestamp.group() 29 | #print timestamp.group() 30 | 31 | # Source and Destination IPs 32 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 33 | ip = r.search(fileHandle.readline()) 34 | if ip is not None: 35 | srcip = ip.group(2) 36 | dstip = ip.group(1) 37 | dst_port = ip.group(3) 38 | #print ip.group(1) 39 | #print ip.group(2) 40 | else: 41 | srcip = None 42 | dstip = None 43 | dst_port = None 44 | 45 | # URL 46 | # for efficiency purposes, skip files that were not affected by the bug 47 | url_line = fileHandle.readline() 48 | if " HTTP/1" in url_line: 49 | return 50 | 51 | r = re.compile('(GET|POST|HEAD) (.*)') 52 | url = r.search(url_line) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | toks = url.split() 58 | url = toks[0] 59 | #print url.group(1) 60 | else: 61 | method = None 62 | 63 | if url is None or len(url.strip())==0: 64 | logging.warning('URL is empty for file: %s' % (file_path,)) 65 | return 66 | 67 | 68 | cursor = conn.cursor() 69 | 70 | cursor.execute(""" 71 | SELECT dump_id FROM pe_dumps 72 | WHERE timestamp = TO_TIMESTAMP(%s) AND server = %s AND client = %s 73 | AND dst_port = %s AND url IS NULL """, (timestamp, srcip, dstip, dst_port)) 74 | if cursor.rowcount > 1: 75 | logging.warning('Found more than one dump_id for file: %s' % (file_path,)) 76 | # elif cursor.rowcount == 0: 77 | # logging.warning('Found no dump_id for file: %s', (file_path,)) 78 | elif cursor.rowcount == 1: 79 | dump_id = cursor.fetchone() 80 | if len(url.strip())>0: 81 | cursor.execute(""" 82 | UPDATE pe_dumps SET url = %s 83 | WHERE dump_id = %s """, (url.strip(), dump_id)) 84 | logging.debug('Updated URL for dump_id: %s (file: %s | url: %s)' % (dump_id,file_path,url)) 85 | 86 | 87 | def main(): 88 | conn = util.connect_to_db() 89 | 90 | logging.basicConfig(level=logging.DEBUG, 91 | filename=LOG_FILE, 92 | filemode='w') 93 | raw_file_names = os.listdir(RAW_FILE_DIR) 94 | for fn in raw_file_names: 95 | file_path = os.path.join(RAW_FILE_DIR, fn) 96 | print "Analyzing file:", file_path 97 | update_url(file_path,conn) 98 | 99 | 100 | if __name__ == "__main__": 101 | main() 102 | -------------------------------------------------------------------------------- /amico_scripts/vt_api.py: -------------------------------------------------------------------------------- 1 | 2 | # Author: Phani Vadrevu 3 | 4 | import os.path 5 | import urllib 6 | import urllib2 7 | import random 8 | 9 | import postfile 10 | import config 11 | from config import * 12 | 
13 | TIMEOUT = 10 14 | 15 | 16 | def get_vt_key(): 17 | #random.seed() 18 | k = random.randint(0, len(vt_keys) - 1) 19 | print "Using VT API key number", k 20 | return vt_keys[k] # vt_keys must be a list of valid VirusTotal API keys 21 | 22 | 23 | def send_file(md5): 24 | host = "www.virustotal.com" 25 | selector = "https://www.virustotal.com/vtapi/v2/file/scan" 26 | fields = [("apikey", get_vt_key())] 27 | 28 | dir_path = "" 29 | if vt_submissions == "manual": 30 | dir_path = MAN_DOWNLOAD_DIR 31 | else: 32 | dir_path = LIVE_DOWNLOAD_DIR 33 | 34 | # just a patch to old code... 35 | # we only submit the first file that matches 36 | # it is anyway highly unlikely that more than one would match 37 | file_name = None 38 | file_path = None 39 | for ext in vt_submissions_ext: 40 | for e in [ext.lower(),ext.upper()]: 41 | fn = md5 + "." + e 42 | fp = os.path.join(dir_path,fn) 43 | if os.path.isfile(fp): 44 | file_name = fn 45 | file_path = fp 46 | break 47 | if file_path: # also stop scanning the remaining extensions after a match 48 | break 49 | 50 | if file_path and os.path.isfile(file_path): 51 | print "VT file submission:", file_path 52 | file_to_send = open(file_path, "rb").read() 53 | files = [("file", file_name, file_to_send)] 54 | json = postfile.post_multipart(host, selector, fields, files) 55 | return json 56 | 57 | 58 | # Either a single hash or a list of hashes (up to 25) can be passed 59 | def rescan_request(arg): 60 | if isinstance(arg, list): 61 | res = "" 62 | for file_hash in arg: 63 | res += file_hash + ', ' 64 | res = res[:-2] 65 | else: 66 | res = arg 67 | url = "https://www.virustotal.com/vtapi/v2/file/rescan" 68 | parameters = {"resource": res, 69 | "apikey": get_vt_key()} 70 | data = urllib.urlencode(parameters) 71 | req = urllib2.Request(url, data) 72 | try: 73 | response = urllib2.urlopen(req, timeout=5*TIMEOUT) 74 | except Exception as e: 75 | print "rescan_request: Exception occurred", e 76 | return 77 | json = response.read() 78 | return json 79 | 80 | 81 | # md5 or sha1 can also be used instead of scan_id 82 | def get_vt_report(scan_id): 83 | url = "https://www.virustotal.com/vtapi/v2/file/report" 84 | parameters = {"resource": scan_id, 85 | "apikey": get_vt_key()} 86 | data = urllib.urlencode(parameters) 87 | req = urllib2.Request(url, data) 88 | try: 89 | response = urllib2.urlopen(req, timeout=TIMEOUT) 90 | except Exception as e: 91 | print "get_vt_report: Exception occurred", e 92 | return 93 | json = response.read() 94 | return json 95 | 96 | 97 | def get_ip_report(ip): 98 | url = "https://www.virustotal.com/vtapi/v2/ip-address/report" 99 | parameters = {"ip": ip, 100 | "apikey": get_vt_key()} 101 | data = urllib.urlencode(parameters) 102 | req = urllib2.Request("%s?%s" % (url, data)) 103 | try: 104 | response = urllib2.urlopen(req, timeout=TIMEOUT) 105 | except Exception as e: 106 | print "get_ip_report: Exception occurred", e 107 | return 108 | json = response.read() 109 | return json 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview # 2 | 3 | **AMICO** is a malware download classification tool that can be deployed in large networks. It reconstructs executable files (e.g., EXE, DLL, DMG, APK, JAR, etc.) from the network traffic and determines if they are malicious or not based on their _provenance information_. 4 | 5 | To classify a file download event, AMICO looks at **who** is downloading **what** and from **where**, rather than analyzing the content of the downloaded files.
6 | 7 | For more technical information, please refer to this [ESORICS 2013 research paper](http://www.perdisci.com/publications/publication-files/amico.pdf) 8 | 9 | **Code**: The latest code, which reconstructs and classifies file dumps other than Windows PE executables, is in the `dev` branch (the `older_code` branch contains the original code dedicated to reconstructing only Windows PE files). 10 | 11 | For more information on how to use and deploy AMICO, please go through the Wiki pages. This is an initial release of the system and we will keep refining the code and documentation. Please open a new Issue if you experience any problems. 12 | 13 | You can also visit our [AMICO-Security Blog](http://amico-security.blogspot.com/). 14 | 15 | 16 | ## SETUP AND DEPLOYMENT ## 17 | Please refer to our [project's Wiki](https://github.com/perdisci/amico/wiki) for detailed information about system requirements, setup, and deployment guidelines. 18 | 19 | 20 | ## CONTACT US ## 21 | If you have any questions, please post a message on our [AMICO-security forum](https://groups.google.com/forum/#!forum/amico-security). 22 | 23 | If you are deploying AMICO in a large _university-like campus network_ and would like to share your experience or know more about our own deployment, please contact us privately at (**perdisci [-at-] cs.uga.edu**). 24 | 25 | 26 | ## LICENSING ## 27 | The code under the "older_code" branch is released under BSD license. Please refer to the COPYING file under that branch for details. 28 | 29 | ## News ## 30 | * [01/17/2017] Written [some guidelines](https://amico-security.blogspot.com/2017/01/installing-pfring.html) on how to install pf_ring and ZC drivers 31 | * [01/11/2016] Enabled submission of file types other than EXE to VirusTotal (in the experimental branch only). 32 | * [04/29/2015] Improved [experimental branch code](https://github.com/perdisci/amico/tree/experimental), and tested capture and classification of APKs and JARs in a large network. 33 | * [03/27/2015] All code in the master branch has been released under **BSD license**. 34 | * [03/27/2015] Moved all project files from GoogleCode to GitHub. 35 | * [01/14/2015] Added some documentation about [syslog reports format](https://github.com/perdisci/amico/wiki/Syslog-Reports-Format). 36 | * [11/20/2014] Added experimental code for supporting file formats other than Windows PE (see svn/branches/experimental). We can currently extract most JAR, APK, DMG, ZIP, RAR, PDF files, and even some Microsoft Office documents. _Limitations_: the feature extraction and provenance classifier currently treat all file types the same way; we are performing more research to see if the behavior-based detection approach used by AMICO can still work well even with non-executable files. 37 | * [11/08/2014] We have created the [AMICO-Security Blog](http://amico-security.blogspot.com/), where we discuss malware campaign discoveries and other related topics. 38 | * [10/09/2014] Quick steps for [tuning packet capture](https://github.com/perdisci/amico/wiki/Tuning-Packet-Capture) and drastically reduce packet loss. 39 | * [10/03/2014] Added a brief [example of how AMICO can be deployed](https://github.com/perdisci/amico/wiki/Deployment-Example) in a network. 40 | * [09/15/2014] We recently fixed a number of rarely-triggered bugs and improved general code quality and stability. 41 | * [09/13/2014] In the Wiki, you can now find more information about the [pe\_dump](https://github.com/perdisci/amico/wiki/pe_dump-Module) component of AMICO. 
42 | * [08/26/2014] We successfully built a PF\_RING-aware version of AMICO (see [how we did it](https://github.com/perdisci/amico/blob/master/external_libs/README)) 43 | -------------------------------------------------------------------------------- /amico_scripts/classify_dump.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2013 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | import sys 16 | import subprocess 17 | 18 | import psycopg2.extras 19 | 20 | import util 21 | from features import features 22 | from config import model_file 23 | 24 | output_file = "test.arff" 25 | 26 | 27 | def print_arff(dump_id): 28 | conn = util.connect_to_db() 29 | cursor = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor) 30 | cursor.execute(""" 31 | SELECT * FROM weka_features 32 | WHERE dump_id = %s""", 33 | (dump_id, )) 34 | if cursor.rowcount == 0: 35 | print "Feature vector not found. Exiting..." 36 | return 37 | res = cursor.fetchone() 38 | res = res._asdict() 39 | del res['raw_dump_num_av_labels'] 40 | del res['raw_dump_trusted_av_labels'] 41 | 42 | w = open(output_file, 'w') 43 | w.write('@RELATION test\n\n') 44 | values = [] 45 | for feature in features: 46 | if feature in ['sha1', 'dump_id', 'host', 'corrupt', 'vt_month_shelf', 47 | 'url_struct']: 48 | data_type = "STRING" 49 | elif feature == "extension_class": 50 | data_type = ("{common_ext,unknown_ext,common_fake,other_ext," 51 | "no_url,no_ext}") 52 | else: 53 | data_type = "NUMERIC" 54 | w.write('@ATTRIBUTE %s %s\n' % (feature, data_type)) 55 | values.append(res[feature]) 56 | #print "%s : %s" % (key, res[key]) 57 | 58 | w.write('@ATTRIBUTE class {pos, neg}\n\n') 59 | w.write('@DATA\n\n') 60 | try: 61 | data_string = ','.join(['?' if (value is None or value == '') else 62 | str(value) for value in values]) 63 | except Exception as e: 64 | print "Error in writing feature vector to file!", e 65 | else: 66 | data_string += ",?" 67 | w.write(data_string + '\n') 68 | w.close() 69 | cursor.close() 70 | conn.close() 71 | 72 | 73 | def classify_dump(dump_id): 74 | print_arff(dump_id) 75 | subprocess.call( 76 | "java -Xmx2000m -cp ./weka.jar " 77 | "weka.classifiers.meta.FilteredClassifier " 78 | "-l %s -p 1,58,59 -distribution -T test.arff " 79 | "> test.result" % (model_file,), shell=True) 80 | 81 | 82 | score = None 83 | with open('test.result', 'r') as f: 84 | for line in f: 85 | if ':' in line: 86 | for word in line.split(): 87 | if '*' in word: 88 | score = word.split(',')[0] 89 | if score.startswith('*'): 90 | score = score[1:] 91 | subprocess.call("rm test.arff", shell=True) 92 | subprocess.call("rm test.result", shell=True) 93 | 94 | print "AMICO Score:", score 95 | update_score(dump_id,score) 96 | 97 | return score 98 | 99 | 100 | def update_score(dump_id,score): 101 | conn = util.connect_to_db() 102 | cursor = conn.cursor() 103 | cursor.execute(""" 104 | DELETE FROM amico_scores 105 | WHERE dump_id = %s""", 106 | (dump_id, )) 107 | cursor.execute("INSERT INTO amico_scores VALUES " 108 | "(%s, %s)", (dump_id, score)) 109 | 110 | 111 | 112 | if __name__ == "__main__": 113 | dump_id = int(sys.argv[1]) 114 | #print_arff(dump_id) # For testing 115 | classify_dump(dump_id) 116 | --------------------------------------------------------------------------------
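The score-parsing loop in classify_dump() scans Weka's prediction output for the starred entry in the class distribution. A sketch of the extraction on one hypothetical test.result line (the exact column layout depends on the Weka version; this simply mirrors the split-on-'*' logic above):

line = "    1        1:pos        1:pos       *0.82,0.18"   # hypothetical Weka -p output
score = None
for word in line.split():
    if '*' in word:
        score = word.split(',')[0].lstrip('*')   # same effect as the startswith('*') check
print score   # 0.82 -> stored in amico_scores and compared against amico_threshold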
/amico_scripts/util.py: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # Copyright (C) 2012 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | """ 16 | Utility functions should be added here 17 | """ 18 | import re 19 | import socket 20 | import socks 21 | import psycopg2 22 | import etld 23 | 24 | from config import * 25 | 26 | 27 | def connect_to_db(): 28 | try: 29 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 30 | % (db_name, db_host, db_user, db_password)) 31 | except Exception as e: 32 | print "Unable to connect to database: " + db_name 33 | print e 34 | conn.set_isolation_level(0) 35 | return conn 36 | 37 | 38 | # Reorder the subdomains in the host name such that 39 | # the TLD comes first. Eg: com.google.www 40 | def reorder_domain(host): 41 | if host is None: 42 | return None 43 | 44 | try: 45 | host = host.split(':')[0] # in case host string contains port 46 | ipreg = re.compile("[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$") 47 | if ipreg.match(host) is None: 48 | ordered_host = "" 49 | host += '.'
50 | domains = re.findall('.*?\.', host) 51 | for i in range(len(domains)): 52 | ordered_host += domains[len(domains) - i - 1] 53 | ordered_host = ordered_host[:-1] 54 | return ordered_host 55 | else: 56 | return host 57 | except Exception as e: 58 | print "exception in reorder_domain for host: %s" % (host,) 59 | print e 60 | return host 61 | 62 | 63 | def is_ip(string): 64 | ipreg = re.compile("[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$") 65 | if ipreg.match(string) is not None: 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def extract_extension(url): 72 | file_name = url.split('?')[0].split('/')[-1] 73 | if '.' in file_name: 74 | ext = file_name.split('.')[-1] 75 | return ext 76 | else: 77 | return None 78 | 79 | 80 | def extract_twold(hostname): 81 | if hostname is None: 82 | return None 83 | 84 | hostname = hostname.strip() 85 | if len(hostname) == 0: 86 | return None 87 | if isIP4Address(hostname): 88 | return None 89 | 90 | try: 91 | etld_obj = etld.etld() 92 | registered = '' 93 | suffix = '' 94 | registered, suffix = etld_obj.parse(hostname) 95 | twold = '.'.join([registered.split('.')[-1], suffix]) 96 | print "hostname: %s -- twold: %s" % (hostname,twold) 97 | return twold 98 | except: 99 | print "Unable to compute twold: hostname: %s" % (hostname,) 100 | 101 | return None 102 | 103 | def isIP4Address(hostname): 104 | ip4reg = re.compile( 105 | "([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$") 106 | m = ip4reg.match(hostname) 107 | if m is not None: 108 | return True 109 | 110 | # Reverse the IP address for querying origin.asn.cymru.com 111 | def reverse_ip(ip): 112 | ipreg = re.compile( 113 | "([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$") 114 | m = ipreg.match(ip) 115 | if m is not None: 116 | return (m.group(4) + "." + m.group(3) + "." + m.group(2) 117 | + "." + m.group(1)) 118 | 119 | 120 | # Setup SOCKS proxy 121 | def setup_socks(): 122 | if socks_proxy_host: 123 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, socks_proxy_host, 124 | socks_proxy_port) 125 | socket.socket = socks.socksocket 126 | -------------------------------------------------------------------------------- /amico_scripts/fe_db_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2012 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. 
# 13 | # # 14 | ########################################################################### 15 | import util 16 | import sys 17 | from config import * 18 | 19 | def fe_db_setup(): 20 | conn = util.connect_to_db() 21 | cursor = conn.cursor() 22 | 23 | cursor.execute(""" DROP table if exists features""") 24 | cursor.execute(""" DROP table if exists weka_features""") 25 | cursor.execute(""" 26 | CREATE TABLE weka_features( 27 | dump_id INT, 28 | raw_dump_num_av_labels INT, 29 | raw_dump_trusted_av_labels INT, 30 | vt_month_shelf BOOLEAN, 31 | corrupt BOOLEAN, 32 | host_malware_downloads INT, 33 | host_suspicious_downloads INT, 34 | host_benign_downloads INT, 35 | host_total_downloads INT, 36 | host_malware_ratio REAL, 37 | host_suspicious_ratio REAL, 38 | host_benign_ratio REAL, 39 | host_avg_av_labels REAL, 40 | host_avg_trusted_labels REAL, 41 | host_unknown_hashes INT, 42 | host_total_hashes INT, 43 | host_unknown_hash_ratio REAL, 44 | twold_malware_downloads INT, 45 | twold_suspicious_downloads INT, 46 | twold_benign_downloads INT, 47 | twold_total_downloads INT, 48 | twold_malware_ratio REAL, 49 | twold_suspicious_ratio REAL, 50 | twold_benign_ratio REAL, 51 | twold_avg_av_labels REAL, 52 | twold_avg_trusted_labels REAL, 53 | twold_unknown_hashes INT, 54 | twold_total_hashes INT, 55 | twold_unknown_hash_ratio REAL, 56 | server_ip_malware_downloads INT, 57 | server_ip_suspicious_downloads INT, 58 | server_ip_benign_downloads INT, 59 | server_ip_total_downloads INT, 60 | server_ip_malware_ratio REAL, 61 | server_ip_suspicious_ratio REAL, 62 | server_ip_benign_ratio REAL, 63 | server_ip_avg_av_labels REAL, 64 | server_ip_avg_trusted_labels REAL, 65 | server_ip_unknown_hashes INT, 66 | server_ip_total_hashes INT, 67 | server_ip_unknown_hash_ratio REAL, 68 | bgp_malware_downloads INT, 69 | bgp_suspicious_downloads INT, 70 | bgp_benign_downloads INT, 71 | bgp_total_downloads INT, 72 | bgp_malware_ratio REAL, 73 | bgp_suspicious_ratio REAL, 74 | bgp_benign_ratio REAL, 75 | bgp_avg_av_labels REAL, 76 | bgp_avg_trusted_labels REAL, 77 | bgp_unknown_hashes INT, 78 | bgp_total_hashes INT, 79 | bgp_unknown_hash_ratio REAL, 80 | hash_life_time INT, 81 | num_dumps_with_same_hash INT, 82 | hash_daily_dump_rate_per_client REAL, 83 | estimated_clients_with_same_hash INT, 84 | referer_exists INT, 85 | host_name_exists INT, 86 | extension_class VARCHAR(20), 87 | url_length INT, 88 | directory_depth INT, 89 | sha1 VARCHAR(40), 90 | host VARCHAR(256), 91 | url_malware_downloads INT, 92 | url_total_downloads INT, 93 | url_distinct_sha1s INT, 94 | url_struct VARCHAR(512), 95 | url_struct_malware_downloads INT, 96 | url_struct_total_downloads INT, 97 | url_struct_distinct_sha1s INT) 98 | """) 99 | 100 | print "Created weka_features table!" 
101 | 102 | conn.commit() 103 | cursor.close() 104 | conn.close() 105 | 106 | if __name__ == '__main__': 107 | sys.exit(fe_db_setup()) 108 | -------------------------------------------------------------------------------- /amico_scripts/db_pe_dumps.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import re 16 | import sys 17 | from config import * 18 | 19 | import util 20 | 21 | def db_pe_dumps(file_path, sha1, md5, file_size): 22 | #print "Time b4 http parsing: %f" %(time.time(),) 23 | # Use Autocommit mode for database connection 24 | conn = util.connect_to_db() 25 | cursor = conn.cursor() 26 | 27 | fileHandle = open(file_path) 28 | 29 | # Timestamp 30 | r = re.compile('[0-9]+') 31 | timestamp = r.search(fileHandle.readline()) 32 | if timestamp is not None: 33 | timestamp = timestamp.group() 34 | #print timestamp.group() 35 | 36 | # Source and Destination IPs 37 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 38 | ip = r.search(fileHandle.readline()) 39 | if ip is not None: 40 | srcip = ip.group(2) 41 | dstip = ip.group(1) 42 | dst_port = ip.group(3) 43 | #print ip.group(1) 44 | #print ip.group(2) 45 | else: 46 | srcip = None 47 | dstip = None 48 | dst_port = None 49 | 50 | # URL 51 | r = re.compile('(GET|POST|HEAD) (.*) ') 52 | url = r.search(fileHandle.readline()) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | #print url.group(1) 58 | else: 59 | method = None 60 | 61 | 62 | # Host 63 | r = re.compile('Host: (.*)') 64 | host = r.search(fileHandle.readline()) 65 | if host is not None: 66 | host = host.group(1) 67 | host = util.reorder_domain(host.strip()) 68 | #print host.group(1) 69 | 70 | 71 | # Referer 72 | r = re.compile('Referer: (.*)') 73 | referer = r.search(fileHandle.readline()) 74 | if referer is not None: 75 | referer = referer.group(1) 76 | #print referer.group(1) 77 | 78 | 79 | # CORRUPT_PE 80 | corrupt_pe = False 81 | r = re.compile('CORRUPT_(PE|FILE)') 82 | corrupt_pe_str = r.search(fileHandle.readline()) 83 | if corrupt_pe_str is not None: 84 | corrupt_pe = True 85 | 86 | 87 | # Now, parse data from the response 88 | # Server 89 | data = fileHandle.read() 90 | r = re.compile('Server: (.*)') 91 | server = r.search(data) 92 | if server is not None: 93 | server = server.group(1) 94 | server = server.rstrip('\r') 95 | server = server[:64] 96 | 97 | # Content-Type 98 | r = re.compile('Content-Type: (.*)') 99 | cont_type = r.search(data) 100 | if cont_type is not None: 101 | cont_type = cont_type.group(1) 102 | cont_type = cont_type.rstrip('\r') 103 | cont_type = cont_type[:128] 104 | 105 | #print "Time after http parsing: %f" %(time.time(),) 106 | # Database statement 107 | cursor.execute(""" 108 | INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host, 109 | referer,server_application,content_type,dst_port,corrupt,file_size) 110 | VALUES 111 | 
(%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", 112 | (sha1, md5, timestamp, srcip, dstip, method, url, host, referer, server, 113 | cont_type, dst_port, corrupt_pe, file_size)) 114 | cursor.execute(""" 115 | SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC 116 | """, (sha1,)) 117 | dump_id = cursor.fetchone()[0] 118 | print ("A new entry on host:%s has been made in pe_dumps table with " 119 | "dump_id %s" % (host, dump_id)) 120 | 121 | fileHandle.close() 122 | cursor.close() 123 | conn.close() 124 | return dump_id, corrupt_pe 125 | 126 | 127 | if __name__ == "__main__": 128 | file_path = sys.argv[1] 129 | sha1 = sys.argv[2] 130 | md5 = sys.argv[3] 131 | file_size = sys.argv[4] 132 | db_pe_dumps(file_path, sha1, md5, file_size) 133 | -------------------------------------------------------------------------------- /file_dump/seq_list.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Roberto Perdisci (perdisci@cs.uga.edu) 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | 18 | #include "seq_list.h" 19 | 20 | seq_list_t *seq_list_init(void) { 21 | 22 | seq_list_t *l = (seq_list_t*)malloc(sizeof(seq_list_t)); 23 | if(l == NULL) { 24 | printf("Failed to initialize seq_list! 
Out of memory???\n"); 25 | fflush(stdout); 26 | exit(1); 27 | } 28 | 29 | memset(l,0,sizeof(seq_list_t)); 30 | l->head = NULL; 31 | l->tail = NULL; 32 | l->next = NULL; 33 | 34 | return l; 35 | } 36 | 37 | void seq_list_destroy(seq_list_t* l, int mz_found) { 38 | 39 | /* DEBUG 40 | if(mz_found) { 41 | printf("Calling seq_list_destroy!!!\n"); 42 | fflush(stdout); 43 | } 44 | */ 45 | 46 | 47 | if(l == NULL) 48 | return; 49 | 50 | seq_list_entry_t *h = l->head; 51 | seq_list_entry_t *n; 52 | 53 | while(h != NULL) { 54 | n = h->next; 55 | free(h); 56 | h = n; 57 | } 58 | 59 | l->head = NULL; 60 | l->tail = NULL; 61 | l->next = NULL; 62 | 63 | free(l); 64 | 65 | 66 | /* DEBUG 67 | if(mz_found) { 68 | printf("Destroyed seq_list!!!\n"); 69 | fflush(stdout); 70 | } 71 | */ 72 | } 73 | 74 | void seq_list_insert(seq_list_t *l, u_int sn, u_int ps) { 75 | 76 | if(l == NULL) 77 | return; 78 | 79 | seq_list_entry_t *e = (seq_list_entry_t*)malloc(sizeof(seq_list_entry_t)); 80 | if(e == NULL) { 81 | printf("Error allocating memory for inserting element in seq_list; Out of memory???\n"); 82 | fflush(stdout); 83 | exit(1); 84 | } 85 | 86 | // initialize the new element 87 | memset(e,0,sizeof(seq_list_entry_t)); 88 | e->sn = sn; 89 | e->ps = ps; 90 | e->next = NULL; 91 | 92 | if(l->head == NULL) { 93 | l->head = e; 94 | l->tail = e; 95 | l->next = e; 96 | 97 | return; 98 | } 99 | 100 | if(l->tail == NULL) { 101 | printf("Error: list tail cannot be null here!\n"); 102 | fflush(stdout); 103 | exit(1); 104 | } 105 | l->tail->next = e; 106 | l->tail = e; 107 | 108 | } 109 | 110 | seq_list_entry_t *seq_list_head(seq_list_t *l) { 111 | 112 | if(l == NULL) 113 | return NULL; 114 | 115 | return l->head; 116 | } 117 | 118 | seq_list_entry_t *seq_list_tail(seq_list_t *l) { 119 | 120 | if(l == NULL) 121 | return NULL; 122 | 123 | return l->tail; 124 | } 125 | 126 | seq_list_entry_t *seq_list_next(seq_list_t *l) { 127 | 128 | if(l == NULL) 129 | return NULL; 130 | 131 | if(l->next == NULL) 132 | return NULL; 133 | 134 | seq_list_entry_t *n = l->next; 135 | l->next = l->next->next; 136 | 137 | return n; 138 | 139 | } 140 | 141 | void seq_list_restart_from_head(seq_list_t *l) { 142 | 143 | if(l == NULL) 144 | return; 145 | 146 | l->next = l->head; 147 | 148 | } 149 | 150 | void seq_list_restart_from_element(seq_list_t *l, seq_list_entry_t *e) { 151 | 152 | if(l == NULL) 153 | return; 154 | 155 | l->next = e; 156 | 157 | } 158 | 159 | u_int seq_list_get_seq_num(seq_list_entry_t *e) { 160 | 161 | if(e == NULL) 162 | return 0; 163 | return e->sn; 164 | } 165 | 166 | 167 | u_int seq_list_get_payload_size(seq_list_entry_t *e) { 168 | 169 | if(e == NULL) 170 | return 0; 171 | return e->ps; 172 | } 173 | 174 | 175 | void seq_list_print(seq_list_t *l) { 176 | 177 | if(l == NULL) 178 | return; 179 | 180 | seq_list_entry_t *e = l->head; 181 | while(e != NULL) { 182 | printf("(%u,%u) ", e->sn, e->ps); 183 | e = e->next; 184 | } 185 | printf("\n"); 186 | 187 | } 188 | 189 | /* For debugging purposes */ 190 | /** 191 | int main(void) { 192 | 193 | seq_list_t *l = seq_list_init(); 194 | 195 | seq_list_insert(l,1,10); 196 | seq_list_insert(l,5,8); 197 | seq_list_insert(l,11,100); 198 | seq_list_insert(l,45,190); 199 | 200 | seq_list_print(l); 201 | 202 | seq_list_destroy(l, 0); 203 | 204 | return 0; 205 | 206 | } 207 | **/ 208 | -------------------------------------------------------------------------------- /amico_scripts/manual_download.py: -------------------------------------------------------------------------------- 1 | 
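# manual_download.py re-downloads a previously captured file from its
# original URL with a plain urllib2 client and records whether the
# re-downloaded payload's sha1/md5 match those of the captured flow.
# Standalone usage (illustrative): python manual_download.py <captured_sha1>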
########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu, Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # perdisci@cs.uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | 16 | import sys 17 | import re 18 | import time 19 | import hashlib 20 | from struct import unpack 21 | from config import capture_file_types 22 | from extract_file import extract_file_type 23 | 24 | import urllib2 25 | 26 | import util 27 | from config import MAN_DOWNLOAD_DIR 28 | 29 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)" 30 | HTTP_TIMEOUT = 40 # HTTP Request timeout 31 | 32 | 33 | # Take the request, download the file and generate sha1 and md5 hashes 34 | # When the file is a valid pe and different from previous, then, save 35 | # it to the downloads directory 36 | def download_file(dump_id, req, captured_sha1): 37 | # Make the request 38 | try: 39 | res = urllib2.urlopen(req, timeout=HTTP_TIMEOUT).read() 40 | except urllib2.URLError, e: 41 | res = None 42 | print "Error making the manual download", e 43 | 44 | sha1 = None 45 | md5 = None 46 | is_interesting_file = None 47 | 48 | if res is None: 49 | print "Executable could not be downloaded manually" 50 | else: 51 | file_type = extract_file_type(res) 52 | if file_type in capture_file_types: 53 | print "Manually downloaded", file_type, "file" 54 | sha1 = hashlib.sha1(res).hexdigest() 55 | 56 | # Store the downloaded file in a sub directory as md5.exe 57 | md5 = hashlib.md5(res).hexdigest() 58 | 59 | download_file = open(MAN_DOWNLOAD_DIR + "/" + md5 + "." + file_type, "w") 60 | download_file.write(res) 61 | download_file.close() 62 | print "Written " + MAN_DOWNLOAD_DIR + "/" + md5 + "." + file_type 63 | is_interesting_file = True 64 | else: 65 | print "Manually downloaded an uninteresting file!" 
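# Whether or not the payload was of interest, the check below compares
# the sha1 of the re-downloaded content against the sha1 of the originally
# captured flow; a mismatch suggests the server served us different content.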
66 | is_interesting_file = False 67 | 68 | if captured_sha1 != sha1: 69 | different = True 70 | print "Checksums did not match for dump_id: ", dump_id 71 | print captured_sha1, "!=", sha1 72 | else: 73 | different = False 74 | 75 | return sha1, md5, different, is_interesting_file 76 | 77 | 78 | def manual_download(captured_sha1): 79 | util.setup_socks() 80 | conn = util.connect_to_db() 81 | cursor = conn.cursor() 82 | 83 | # Database query to get the relevant recent record 84 | cursor.execute(""" 85 | SELECT dump_id,host,url,referer,client,server FROM pe_dumps WHERE sha1 = %s 86 | ORDER BY timestamp DESC;""", (captured_sha1,)) 87 | row = cursor.fetchone() 88 | dump_id = row[0] 89 | host = row[1] 90 | url = row[2] 91 | referer = row[3] 92 | client = row[4] 93 | server = row[5] 94 | 95 | full_url = "http://" 96 | ordered_host = server # if host is null, we use the server IP 97 | if host: 98 | ordered_host = util.reorder_domain(host) 99 | full_url += ordered_host 100 | if url: 101 | full_url += url 102 | print "Starting manual download from:", full_url 103 | 104 | # Prepare the urllib2 request 105 | req = urllib2.Request(full_url) 106 | req.add_header("User-Agent", USER_AGENT) 107 | 108 | download_time = time.time() 109 | sha1, md5, different, is_interesting_file = download_file(dump_id, req, captured_sha1) 110 | 111 | # Database statement 112 | cursor.execute(""" 113 | INSERT INTO manual_download_checksums(dump_id, sha1, 114 | md5, different, referer_exists, timestamp, is_pe) 115 | VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""", 116 | (dump_id, sha1, md5, different, False, download_time, is_interesting_file)) 117 | 118 | cursor.close() 119 | conn.close() 120 | 121 | if __name__ == "__main__": 122 | manual_download(sys.argv[1]) 123 | -------------------------------------------------------------------------------- /amico_scripts/db_file_dumps.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2014 Phani Vadrevu, Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. 
# 12 | # # 13 | ########################################################################### 14 | 15 | import re 16 | import sys 17 | from config import * 18 | 19 | import util 20 | 21 | def db_file_dumps(file_path, sha1, md5, file_size, file_type): 22 | #print "Time b4 http parsing: %f" %(time.time(),) 23 | # Use Autocommit mode for database connection 24 | conn = util.connect_to_db() 25 | cursor = conn.cursor() 26 | 27 | fileHandle = open(file_path) 28 | 29 | # Timestamp 30 | r = re.compile('[0-9]+') 31 | timestamp = r.search(fileHandle.readline()) 32 | if timestamp is not None: 33 | timestamp = timestamp.group() 34 | #print timestamp.group() 35 | 36 | # Source and Destination IPs 37 | r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*') 38 | ip = r.search(fileHandle.readline()) 39 | if ip is not None: 40 | srcip = ip.group(2) 41 | dstip = ip.group(1) 42 | dst_port = ip.group(3) 43 | #print ip.group(1) 44 | #print ip.group(2) 45 | else: 46 | srcip = None 47 | dstip = None 48 | dst_port = None 49 | 50 | # URL 51 | r = re.compile('(GET|POST|HEAD) (.*)') 52 | url = r.search(fileHandle.readline()) 53 | if url is not None: 54 | method = url.group(1) 55 | method = method[:10] 56 | url = url.group(2) 57 | toks = url.split() 58 | url = toks[0] 59 | #print url.group(1) 60 | else: 61 | method = None 62 | 63 | 64 | # Host 65 | r = re.compile('Host: (.*)') 66 | host = r.search(fileHandle.readline()) 67 | if host is not None: 68 | host = host.group(1) 69 | host = util.reorder_domain(host.strip()) 70 | #print host.group(1) 71 | 72 | 73 | # Referer 74 | r = re.compile('Referer: (.*)') 75 | referer = r.search(fileHandle.readline()) 76 | if referer is not None: 77 | referer = referer.group(1) 78 | #print referrer.group(1) 79 | 80 | 81 | # CORRUPT_PE 82 | corrupt_pe = False 83 | r = re.compile('CORRUPT_FILE') 84 | corrupt_pe_str = r.search(fileHandle.readline()) 85 | if corrupt_pe_str is not None: 86 | corrupt_pe = True 87 | 88 | 89 | # Now, parse data from the response 90 | # Server 91 | data = fileHandle.read() 92 | r = re.compile('Server: (.*)') 93 | server = r.search(data) 94 | if server is not None: 95 | server = server.group(1) 96 | server = server.rstrip('\r') 97 | server = server[:64] 98 | 99 | # Content-Type 100 | r = re.compile('Content-Type: (.*)') 101 | cont_type = r.search(data) 102 | if cont_type is not None: 103 | cont_type = cont_type.group(1) 104 | cont_type = cont_type.rstrip('\r') 105 | cont_type = cont_type[:128] 106 | 107 | #print "Time after http parsing: %f" %(time.time(),) 108 | # Database statement 109 | cursor.execute(""" 110 | INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host, 111 | referer,server_application,content_type,dst_port,corrupt,file_size,file_type) 112 | VALUES 113 | (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""", 114 | (sha1, md5, timestamp, srcip, dstip, method, url, host, referer, server, 115 | cont_type, dst_port, corrupt_pe, file_size, file_type)) 116 | cursor.execute(""" 117 | SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC LIMIT 1 118 | """, (sha1,)) 119 | dump_id = cursor.fetchone()[0] 120 | print ("A new entry on host:%s has been made in pe_dumps table with " 121 | "dump_id %s" % (host, dump_id)) 122 | 123 | fileHandle.close() 124 | cursor.close() 125 | conn.close() 126 | return dump_id, corrupt_pe, host, dstip, srcip 127 | 128 | 129 | if __name__ == "__main__": 130 | file_path = sys.argv[1] 131 | sha1 = sys.argv[2] 132 | md5 = sys.argv[3] 133 | file_size = sys.argv[4] 134 | file_type = sys.argv[5] 135 | 
db_file_dumps(file_path, sha1, md5, file_size, file_type) 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /amico_scripts/ip2asn.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | import psycopg2 17 | import socks 18 | import socket 19 | import time 20 | import subprocess 21 | 22 | import util 23 | from config import * 24 | 25 | 26 | USER_AGENT = "Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)" 27 | CYMRU_TIMEOUT = 1 # Timeout for cymru dig call 28 | 29 | 30 | def ip2asn(dump_id): 31 | # Connect to database 32 | try: 33 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 34 | % (db_name, db_host, db_user, db_password)) 35 | except: 36 | sys.exit("Unable to connect to database: " + db_name) 37 | 38 | # Use Autocommit mode for database connection 39 | conn.set_isolation_level(0) 40 | cursor = conn.cursor() 41 | 42 | # Setup SOCKS proxy 43 | if socks_proxy_host: 44 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, 45 | socks_proxy_host, socks_proxy_port) 46 | socket.socket = socks.socksocket 47 | ### 48 | 49 | # Database query to get the relevant recent record 50 | cursor.execute(""" 51 | SELECT server, timestamp FROM pe_dumps WHERE dump_id = %s 52 | """, (dump_id,)) 53 | row = cursor.fetchone() 54 | server_ip = row[0] 55 | # Exit if an AS containing this IP has been logged within the last month 56 | cursor.execute(""" 57 | SELECT * FROM bgp2asn WHERE log_date > (current_date - interval '1 month') 58 | AND bgp_prefix >> %s """, (server_ip,)) 59 | if cursor.rowcount > 0: 60 | return 61 | 62 | # Query whois.cymru.com 63 | #cmd = subprocess.Popen(['whois','-h','whois.cymru.com','-v', 64 | # server_ip], stdout = subprocess.PIPE) 65 | #as_info = cmd.stdout 66 | #for line in as_info: 67 | # if(server_ip in line): 68 | # output = line.split('|') 69 | # break 70 | #words=[] 71 | #for word in output: 72 | # words.append(word.strip()) 73 | 74 | # Query asn.cymru.com using dig 75 | # A sample output is: 76 | # "701 1239 3549 3561 7132 | 216.90.108.0/24 | US | arin | 1998-09-25" 77 | print "Querying Team Cymru for AS info of", server_ip 78 | cmd = subprocess.Popen(['dig', '+short', util.reverse_ip(server_ip) + 79 | '.origin.asn.cymru.com', 'TXT'], stdout=subprocess.PIPE) 80 | time.sleep(CYMRU_TIMEOUT) 81 | if cmd.poll() is None: 82 | cmd.kill() 83 | return 84 | as_info = cmd.stdout.readline() 85 | as_info = as_info.strip().strip('"') 86 | output = as_info.split('|') 87 | words = [] 88 | for answer in output: 89 | if answer: 90 | words.append(answer.split()[0].strip()) 91 | else: 92 | words.append(None) 93 | 94 | #print words 95 | as_number = words[0] 96 | bgp_prefix = words[1] 97 | country_code = words[2] 98 | date_allocated = words[4] 99 | 100 | # Sample output: 101 | # "23028 | US | arin | 2002-01-04 | TEAMCYMRU - SAUNET" 102 | cmd = subprocess.Popen(['dig', '+short', 'AS' + as_number + 
'.asn.cymru.com', 103 | 'TXT'], stdout=subprocess.PIPE) 104 | time.sleep(CYMRU_TIMEOUT) 105 | if cmd.poll() is None: 106 | cmd.kill() 107 | print ("ip2asn.py: Couldn't finish the call to cymru for {0}. Aborting..." 108 | .format((server_ip,))) 109 | return 110 | as_info = cmd.stdout.readline() 111 | as_info = as_info.strip().strip('"') 112 | output = as_info.split('|') 113 | words = [] 114 | for word in output: 115 | words.append(word.strip()) 116 | print words 117 | as_name = words[4] 118 | 119 | # Store the record in the database 120 | cursor.execute(""" 121 | INSERT INTO 122 | bgp2asn 123 | (bgp_prefix, as_number, as_name, country_code, 124 | date_allocated, log_date) 125 | VALUES (%s,%s,%s,%s,%s,current_date)""" 126 | , (bgp_prefix, as_number, as_name, country_code, 127 | date_allocated)) 128 | 129 | cursor.close() 130 | conn.close() 131 | 132 | if __name__ == "__main__": 133 | ip2asn(sys.argv[1]) 134 | -------------------------------------------------------------------------------- /amico_scripts/db_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2011 Phani Vadrevu # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | 16 | import psycopg2 17 | import os 18 | import config 19 | import re 20 | 21 | from config import * 22 | from fe_db_setup import fe_db_setup 23 | 24 | # Reorder the subdomains in the host name such that 25 | # the TLD comes first. Eg: com.google.www 26 | def reorder_domain(host): 27 | host = host.split(':')[0] # in case host string contains port 28 | 29 | ordered_host = "" 30 | host += '.' 
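# Worked example for the loop below: "www.google.com" becomes
# "www.google.com.", re.findall('.*?\.') yields ['www.', 'google.', 'com.'],
# and concatenating the pieces in reverse (dropping the trailing dot)
# produces "com.google.www".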
31 | domains = re.findall('.*?\.',host) 32 | for i in range(len(domains)): 33 | ordered_host += domains[len(domains)-i-1] 34 | ordered_host = ordered_host[:-1] 35 | return ordered_host 36 | 37 | # Connect to database 38 | try: 39 | conn = psycopg2.connect("dbname=%s host=%s user=%s password=%s" 40 | %(db_name,db_host,db_user,db_password)) 41 | except: 42 | raise SystemExit("Unable to connect to database: "+db_name) 43 | 44 | conn.set_isolation_level(0) 45 | cursor = conn.cursor() 46 | 47 | try: 48 | cursor.execute(""" 49 | CREATE TABLE pe_dumps( dump_id SERIAL,PRIMARY KEY(dump_id), 50 | sha1 VARCHAR(40),md5 VARCHAR(32),timestamp TIMESTAMP, server INET, 51 | client INET,method VARCHAR(10),url VARCHAR(512),host VARCHAR(256), 52 | referer VARCHAR(512),server_application VARCHAR(64), 53 | content_type VARCHAR(128),dst_port INT,corrupt BOOLEAN, 54 | file_size INT,file_type VARCHAR(5)) 55 | """) 56 | except psycopg2.DatabaseError as e: 57 | print e 58 | try: 59 | cursor.execute("CREATE INDEX pd_sha1_index ON pe_dumps(sha1)") 60 | except psycopg2.DatabaseError as e: 61 | print e 62 | try: 63 | cursor.execute("CREATE INDEX pd_md5_index ON pe_dumps(md5)") 64 | except psycopg2.DatabaseError as e: 65 | print e 66 | try: 67 | cursor.execute("CREATE INDEX pd_host_index ON pe_dumps(host)") 68 | except psycopg2.DatabaseError as e: 69 | print e 70 | try: 71 | cursor.execute(""" 72 | CREATE TABLE virus_total_scans(vt_id SERIAL,PRIMARY KEY(vt_id), 73 | sha1 VARCHAR(40),md5 VARCHAR(32),json TEXT,num_av_labels INT, 74 | trusted_av_labels INT,scan_time TIMESTAMP,query_time TIMESTAMP, 75 | first_seen TIMESTAMP) 76 | """) 77 | except psycopg2.DatabaseError as e: 78 | print e 79 | 80 | try: 81 | cursor.execute(""" 82 | CREATE TABLE virus_total_submissions( 83 | vt_submit_id SERIAL, 84 | PRIMARY KEY(vt_submit_id), 85 | submit_time TIMESTAMP, 86 | sha1 VARCHAR(40), 87 | md5 VARCHAR(32), 88 | json TEXT, 89 | num_av_labels INT, 90 | trusted_av_labels INT, 91 | scan_time TIMESTAMP, 92 | scan_id VARCHAR(75), 93 | resubmit_id INT REFERENCES virus_total_submissions(vt_submit_id)) 94 | """) 95 | except psycopg2.DatabaseError as e: 96 | print e 97 | try: 98 | cursor.execute(""" 99 | CREATE TABLE ped_vts_mapping (dump_id INT REFERENCES pe_dumps(dump_id), 100 | vt_id INT REFERENCES virus_total_scans(vt_id)) 101 | """) 102 | except psycopg2.DatabaseError as e: 103 | print e 104 | 105 | try: 106 | cursor.execute("CREATE INDEX vt_sha1_index ON virus_total_scans(sha1)") 107 | except psycopg2.DatabaseError as e: 108 | print e 109 | try: 110 | cursor.execute("CREATE INDEX vt_md5_index ON virus_total_scans(md5)") 111 | except psycopg2.DatabaseError as e: 112 | print e 113 | 114 | try: 115 | cursor.execute(""" 116 | CREATE TABLE manual_download_checksums(dump_id INT REFERENCES pe_dumps(dump_id), 117 | sha1 VARCHAR(40), md5 VARCHAR(32), different BOOLEAN, referer_exists BOOLEAN, 118 | timestamp TIMESTAMP, is_pe BOOLEAN); 119 | """) 120 | except psycopg2.DatabaseError as e: 121 | print e 122 | 123 | try: 124 | cursor.execute(""" 125 | CREATE TABLE bgp2asn(bgp_prefix INET, as_number INT, as_name VARCHAR(512), 126 | country_code VARCHAR(2), date_allocated DATE, log_date DATE) 127 | """) 128 | except psycopg2.DatabaseError as e: 129 | print e 130 | 131 | try: 132 | cursor.execute(""" 133 | CREATE TABLE amico_scores( 134 | dump_id INT PRIMARY KEY REFERENCES pe_dumps(dump_id), 135 | score REAL) 136 | """) 137 | except psycopg2.DatabaseError as e: 138 | print e 139 | 140 | print("""Created tables: pe_dumps, virus_total_scans, virus_total_submissions, ped_vts_mapping, manual_download_checksums, 
141 | bgp2asn, amico_scores""") 142 | 143 | fe_db_setup() 144 | cursor.close() 145 | conn.close() 146 | -------------------------------------------------------------------------------- /amico_scripts/db_virus_total.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011 Phani Vadrevu and Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from datetime import datetime, timedelta, MINYEAR 17 | import time 18 | 19 | import simplejson 20 | import logging 21 | import logging.config 22 | 23 | import util 24 | import vt_api 25 | from config import trusted_av_vendors 26 | 27 | LOG_CONF_FILE = "logging.conf" 28 | # Do not make a new query for the same sha1 if the previous query was made 29 | # with in VT_QUERY_INTERVAL (in days) 30 | VT_QUERY_INTERVAL = 1 31 | MAX_TRIES = 3 32 | 33 | 34 | def insert_report(cursor, report, sha1, md5, json, dump_id): 35 | scan_time = report["scan_date"] 36 | scans = report["scans"] 37 | num_av_labels = report["positives"] 38 | trusted_av_labels = 0 39 | for k, v in scans.iteritems(): 40 | if v["detected"] is True: 41 | if k in trusted_av_vendors: 42 | trusted_av_labels += 1 43 | scan_time += " UTC" 44 | cursor.execute(""" 45 | INSERT INTO virus_total_scans(sha1,md5,json,num_av_labels, 46 | trusted_av_labels,scan_time,query_time) 47 | VALUES (%s,%s,%s,%s,%s,TIMESTAMP WITH TIME ZONE %s, 48 | CLOCK_TIMESTAMP()) 49 | RETURNING vt_id 50 | """, (sha1, md5, json, num_av_labels, 51 | trusted_av_labels, scan_time)) 52 | vt_id = cursor.fetchone()[0] 53 | 54 | cursor.execute(""" 55 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 56 | VALUES (%s, %s)""", 57 | (dump_id, vt_id)) 58 | print "Virus Total: Scan report found. Entry has been made into" 59 | print "virus_total_scans table" 60 | 61 | 62 | def db_virus_total(dump_id): 63 | logging.config.fileConfig(LOG_CONF_FILE) 64 | logger = logging.getLogger("amico_logger") 65 | util.setup_socks() 66 | conn = util.connect_to_db() 67 | cursor = conn.cursor() 68 | 69 | # Exit if this sha1 has been queried in the past VT_QUERY_INTERVAL period 70 | prev_query_time = datetime(MINYEAR, 1, 1, 0, 0, 0, 0) 71 | time_now = datetime.now() 72 | cursor.execute(""" 73 | SELECT sha1, md5 74 | FROM pe_dumps 75 | WHERE dump_id = %s""", 76 | (dump_id,)) 77 | (sha1, md5) = cursor.fetchone() 78 | 79 | try: 80 | cursor.execute("SELECT query_time, vt_id FROM virus_total_scans " 81 | "WHERE sha1 = %s " 82 | "ORDER by query_time DESC", (sha1,)) 83 | res = cursor.fetchone() 84 | if res: 85 | prev_query_time = res[0] 86 | vt_id = res[1] 87 | except: 88 | print "sha1:%s no previous VT query" % (sha1, ) 89 | pass 90 | 91 | vt_query_period = timedelta(days=VT_QUERY_INTERVAL) 92 | if (time_now - prev_query_time) < vt_query_period: 93 | print "sha1:%s has been queried recently. Skipping..." 
% (sha1, ) 94 | cursor.execute(""" 95 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 96 | VALUES (%s, %s)""", 97 | (dump_id, vt_id)) 98 | conn.close() 99 | return 100 | 101 | tries = 0 102 | success = False 103 | while tries < MAX_TRIES: 104 | try: 105 | tries += 1 106 | json = vt_api.get_vt_report(md5) 107 | if not json: 108 | continue 109 | report = simplejson.loads(json) 110 | if report["response_code"] == 1: 111 | insert_report(cursor, report, sha1, md5, json, dump_id) 112 | success = True 113 | break 114 | elif report["response_code"] == 0: 115 | cursor.execute(""" 116 | INSERT INTO virus_total_scans(sha1, md5, query_time) 117 | VALUES (%s, %s, CLOCK_TIMESTAMP()) 118 | RETURNING vt_id 119 | """, (sha1, md5)) 120 | vt_id = cursor.fetchone()[0] 121 | cursor.execute(""" 122 | INSERT INTO ped_vts_mapping (dump_id, vt_id) 123 | VALUES (%s, %s)""", 124 | (dump_id, vt_id)) 125 | print "Virus Total: No scan report exists in the VT database" 126 | success = True 127 | break 128 | else: 129 | logger.exception("Unknown response code! %s" % 130 | (report["response_code"],)) 131 | time.sleep(1) 132 | 133 | except Exception as e: 134 | print e 135 | logger.exception("Try %s. Error in fetching report for md5 %s: %s" 136 | % (tries, md5, e)) 137 | time.sleep(5) 138 | if not success: 139 | cursor.execute(""" 140 | INSERT INTO ped_vts_mapping (dump_id) 141 | VALUES (%s)""", 142 | (dump_id,)) 143 | logger.warning("Giving up on dump_id: %s's VT report" % (dump_id,)) 144 | cursor.close() 145 | conn.close() 146 | 147 | if __name__ == "__main__": 148 | db_virus_total(sys.argv[1]) 149 | -------------------------------------------------------------------------------- /amico_scripts/extract_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ########################################################################### 4 | # Copyright (C) 2011 Roberto Perdisci # 5 | # perdisci@cs.uga.edu # 6 | # # 7 | # Distributed under the GNU Public License # 8 | # http://www.gnu.org/licenses/gpl.txt # 9 | # # 10 | # This program is free software; you can redistribute it and/or modify # 11 | # it under the terms of the GNU General Public License as published by # 12 | # the Free Software Foundation; either version 2 of the License, or # 13 | # (at your option) any later version. # 14 | # # 15 | ########################################################################### 16 | 17 | import sys, os 18 | import re 19 | from struct import unpack 20 | from config import capture_file_types 21 | 22 | def prune_http_resp_headers(data): 23 | # finds start of resp header 24 | m = re.search("HTTP/\d\.\d\s\d\d\d", data) 25 | if m: 26 | pos = m.start() 27 | data = data[pos:] 28 | 29 | # now we can search for the end of the response header 30 | m = re.search('\r\n\r\n',data) 31 | if m: 32 | pos = m.start() 33 | return data[pos+4:] # returns all data after \r\n\r\n 34 | 35 | 36 | def is_pe_file(bin_data): 37 | if bin_data[0:2] == 'MZ': 38 | offset = unpack('i', bin_data[0x3c:0x3c+4])[0] 39 | if bin_data[offset:offset+2] == 'PE': 40 | # print "This is a PE file!" 41 | return True 42 | 43 | # print "This is NOT a PE file!" 44 | return False 45 | 46 | 47 | def is_jar_file(bin_data): 48 | if bin_data[0:4].encode('hex').upper() == '504B0304': 49 | # print "Searching for manifest.mf" 50 | regex = re.compile('MANIFEST.MF',re.IGNORECASE) 51 | m = regex.search(bin_data) 52 | if m: 53 | # print "Found manifest!" 
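# (A JAR that also bundles AndroidManifest.xml is really an APK, which is
# why extract_file_type() below re-tests JAR matches with is_apk_file().)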
54 | return True 55 | 56 | 57 | def is_apk_file(bin_data): 58 | if bin_data[0:4].encode('hex').upper() == '504B0304': 59 | # print "Searching for AndroidManifest.xml" 60 | regex = re.compile('AndroidManifest.xml',re.IGNORECASE) 61 | m = regex.search(bin_data) 62 | if m: 63 | # print "Found Android Manifest!" 64 | return True 65 | 66 | 67 | def is_elf_file(bin_data): 68 | if bin_data[0].encode('hex').upper() == '7F': 69 | if bin_data[1:4] == 'ELF': 70 | return True 71 | return False 72 | 73 | 74 | def is_pdf_file(bin_data): 75 | if bin_data[0:4] == '%PDF': 76 | return True 77 | return False 78 | 79 | 80 | def is_rar_file(bin_data): 81 | if bin_data[0:4] == 'Rar!': 82 | return True 83 | return False 84 | 85 | 86 | def is_zip_file(bin_data): 87 | if bin_data[0:4].encode('hex').upper() == '504B0304': 88 | return True 89 | return False 90 | 91 | 92 | def is_swf_file(bin_data): 93 | magicstr = bin_data[0:3].encode('hex') 94 | if magicstr == '465753' or magicstr == '435753' or magicstr == '5A5753': 95 | return True 96 | return False 97 | 98 | 99 | def is_msdoc_file(bin_data): 100 | # msdocx_magic[] = {0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00}; 101 | # msdoc_magic[] = {0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1}; 102 | magicstr = bin_data[0:8].encode('hex') 103 | if magicstr == '504B030414000600': 104 | return True 105 | if magicstr == 'D0CF11E0A1B11AE1': 106 | return True 107 | return False 108 | 109 | 110 | def is_dmg_file(bin_data): 111 | 112 | magicstr = bin_data[0:1].encode('hex') 113 | if magicstr == '78' or bin_data[0:3] == 'BZh': 114 | regex = re.compile('koly',re.IGNORECASE) 115 | m = regex.search(bin_data) 116 | if m: 117 | # print "Found koly!" 118 | return True 119 | return False 120 | 121 | 122 | def extract_file_type(data): 123 | 124 | file_type = None 125 | 126 | if not file_type and is_pe_file(data): 127 | file_type = "EXE" 128 | 129 | if not file_type and is_jar_file(data): 130 | file_type = "JAR" 131 | 132 | if (not file_type or file_type=="JAR") and is_apk_file(data): 133 | file_type = "APK" 134 | 135 | if not file_type and is_elf_file(data): 136 | file_type = "ELF" 137 | 138 | if not file_type and is_dmg_file(data): 139 | file_type = "DMG" 140 | 141 | if not file_type and is_msdoc_file(data): 142 | file_type = "MSDOC" 143 | 144 | if not file_type and is_rar_file(data): 145 | file_type = "RAR" 146 | 147 | if not file_type and is_swf_file(data): 148 | file_type = "SWF" 149 | 150 | if not file_type and is_pdf_file(data): 151 | file_type = "PDF" 152 | 153 | if not file_type and is_zip_file(data): 154 | # notice that this is more generic than other 155 | # derived file formats (e.g., JAR, DOCX, etc.) 156 | # and therefore this check should run last! 
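# ("PK\x03\x04", i.e. 50 4B 03 04, is also the magic number of JAR, APK
# and OOXML documents, so the plain-ZIP test must come after those checks.)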
157 | file_type = "ZIP" 158 | 159 | return file_type 160 | 161 | 162 | 163 | def extract_file(flow_file, dst=None): 164 | 165 | if not dst: 166 | dst = flow_file 167 | 168 | f = open(flow_file, 'rb') 169 | data = f.read() 170 | f.close() 171 | 172 | data = prune_http_resp_headers(data) 173 | 174 | file_type = None 175 | file_extension = '' 176 | 177 | if not file_type and is_pe_file(data): 178 | file_type = "EXE" 179 | file_extension = "exe" 180 | 181 | if not file_type and is_jar_file(data): 182 | file_type = "JAR" 183 | file_extension = "jar" 184 | 185 | if (not file_type or file_type=="JAR") and is_apk_file(data): 186 | file_type = "APK" 187 | file_extension = "apk" 188 | 189 | if not file_type and is_elf_file(data): 190 | file_type = "ELF" 191 | file_extension = "elf" 192 | 193 | if not file_type and is_dmg_file(data): 194 | file_type = "DMG" 195 | file_extension = "dmg" 196 | 197 | if not file_type and is_msdoc_file(data): 198 | file_type = "MSDOC" 199 | file_extension = "msdoc" # generic for DOC(X), PPT(X), XLS(X), etc. 200 | 201 | if not file_type and is_rar_file(data): 202 | file_type = "RAR" 203 | file_extension = "rar" 204 | 205 | if not file_type and is_swf_file(data): 206 | file_type = "SWF" 207 | file_extension = "swf" 208 | 209 | if not file_type and is_pdf_file(data): 210 | file_type = "PDF" 211 | file_extension = "pdf" 212 | 213 | if not file_type and is_zip_file(data): 214 | # notice that this is more generic than other 215 | # derived file formats (e.g., JAR, DOCX, etc.) 216 | # and therefore this check should run last! 217 | file_type = "ZIP" 218 | file_extension = "zip" 219 | 220 | if file_type in capture_file_types: 221 | dst = dst+'.'+file_extension 222 | print "Writing file:", dst 223 | f = open(dst, 'wb') 224 | f.write(data) 225 | f.close() 226 | print "Finished!" 227 | return (file_type, dst, file_extension) 228 | 229 | return(None, None, None) 230 | 231 | 232 | 233 | if __name__ == '__main__': 234 | extract_file(sys.argv[1]) 235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /amico_scripts/start_amico.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | ########################################################################### 4 | # Copyright (C) 2014 Phani Vadrevu and Roberto Perdisci # 5 | # pvadrevu@uga.edu # 6 | # perdisci@uga.edu # 7 | # # 8 | # Distributed under the GNU Public License # 9 | # http://www.gnu.org/licenses/gpl.txt # 10 | # # 11 | # This program is free software; you can redistribute it and/or modify # 12 | # it under the terms of the GNU General Public License as published by # 13 | # the Free Software Foundation; either version 2 of the License, or # 14 | # (at your option) any later version. 
# 15 | # # 16 | ########################################################################### 17 | from multiprocessing import Process 18 | import shutil 19 | import os 20 | import subprocess 21 | import hashlib 22 | import time 23 | import traceback 24 | from cachetools import TTLCache 25 | 26 | from config import whitelist_domains, vt_submissions as vts_config 27 | from vt_submit import vt_submissions_func 28 | # from pe_extract import pe_extract 29 | from extract_file import extract_file 30 | from db_file_dumps import db_file_dumps 31 | from db_virus_total import db_virus_total 32 | from manual_download import manual_download 33 | from ip2asn import ip2asn 34 | from get_feature_vector import get_feature_vector 35 | from classify_dump import classify_dump,update_score 36 | from db_syslog import db_syslog 37 | 38 | WAIT_TIME = 1 39 | DUMP_DIR = "../file_dump/dumps" 40 | RAW_DIR = "parsed/raw_files/" 41 | FILES_DIR = "parsed/captured_files/" 42 | MD_TIMEOUT = 180 43 | VT_TIMEOUT = 60 44 | 45 | md5host_cache = TTLCache(100000,ttl=60*10) 46 | MAX_MD5_CACHE_COUNT = 1 47 | 48 | hostcs_cache = TTLCache(100000,ttl=60*10) 49 | MAX_HOSTCS_CACHE_COUNT = 3 50 | 51 | # Makes a function call in a separate process 52 | # and makes sure it times out after 'timeout' seconds 53 | def process_timeout(func, func_args, timeout): 54 | p = Process(target=func, args=(func_args,)) 55 | p.start() 56 | p.join(timeout) 57 | p.terminate() 58 | 59 | 60 | def is_whitelisted(file_name): 61 | with open(file_name) as f: 62 | for _ in xrange(6): 63 | line = f.readline() 64 | if line.startswith("% Host:"): 65 | tok = line.split(':') 66 | if len(tok)>1: 67 | host = tok[1].strip() 68 | for domain in whitelist_domains: 69 | if host == domain or host.endswith('.'+domain): 70 | return True 71 | return False 72 | 73 | 74 | def get_file_hashes(file_path): 75 | with open(file_path, 'rb') as f: 76 | cont = f.read() 77 | sha1 = hashlib.sha1(cont).hexdigest() 78 | md5 = hashlib.md5(cont).hexdigest() 79 | file_size = os.stat(file_path).st_size 80 | return sha1, md5, file_size 81 | 82 | 83 | def process_file(raw_path, file_name): 84 | file_type,file_path,file_extension = extract_file(raw_path) 85 | print "raw_file:", raw_path 86 | print "file_path:", file_path 87 | if not file_type: 88 | print "This is NOT a file of interest! " 89 | print "Removing raw data from disk:", raw_path 90 | # remove the related raw file 91 | os.remove(raw_path) 92 | print "Removed!" 93 | return 94 | print "file_type:", file_type 95 | 96 | # If we are really dealing with a PE file 97 | sha1, md5, file_size = get_file_hashes(file_path) 98 | dump_id, corrupt_pe, host, client, server = db_file_dumps(raw_path, sha1, md5, file_size, file_type) 99 | 100 | skip_classification = False 101 | score = None 102 | 103 | # check if we have already recently classified the same md5 dump from the same host 104 | md5_cache_key = md5 105 | if host is not None: 106 | md5_cache_key += '-'+host 107 | if md5_cache_key in md5host_cache.keys(): 108 | md5host_cache[md5_cache_key]['count'] += 1 109 | if md5host_cache[md5_cache_key]['count'] > MAX_MD5_CACHE_COUNT: 110 | # do not classify again! 
retrieve cached score 111 | skip_classification = True 112 | score = md5host_cache[md5_cache_key]['score'] # get the last cached score 113 | print "MD5 CACHE: will use previous score : %s %s %s %s" %(dump_id,md5,host,score) 114 | elif not corrupt_pe: 115 | md5host_cache[md5_cache_key] = {'count':1, 'score':None} 116 | 117 | # check if we have already recently classified several dumps from the same host,client,server 118 | hostcs_cache_key = '' 119 | if host is not None: 120 | hostcs_cache_key += host 121 | hostcs_cache_key += '-'+client 122 | hostcs_cache_key += '-'+server 123 | if hostcs_cache_key in hostcs_cache.keys(): 124 | hostcs_cache[hostcs_cache_key]['count'] += 1 125 | if hostcs_cache[hostcs_cache_key]['count'] > MAX_HOSTCS_CACHE_COUNT: 126 | # do not classify again! retrieve cached score 127 | skip_classification = True 128 | if score is None: 129 | score = hostcs_cache[hostcs_cache_key]['score'] # get the last cached score 130 | print "HOSTCS CACHE: will use previous score : %s %s %s %s" %(dump_id,host,server,score) 131 | elif not corrupt_pe: 132 | hostcs_cache[hostcs_cache_key] = {'count':1, 'score':None} 133 | 134 | 135 | if not corrupt_pe and (not skip_classification or score is None): 136 | ip2asn(dump_id) 137 | get_feature_vector(dump_id,file_type) 138 | score = classify_dump(dump_id) 139 | md5host_cache[md5_cache_key]['score'] = score # update cached score 140 | hostcs_cache[hostcs_cache_key]['score'] = score # update cached score 141 | 142 | # query VT 143 | Process(target=process_timeout, 144 | args=(db_virus_total, (dump_id,), VT_TIMEOUT)).start() 145 | if vts_config == "manual": # attempt to re-download the file "manually" 146 | Process(target=process_timeout, 147 | args=(manual_download, sha1, MD_TIMEOUT)).start() 148 | 149 | if not corrupt_pe: 150 | if score is None: print "ERROR : None score : this should not happen! dump_id=", dump_id 151 | if skip_classification and not score is None: 152 | update_score(dump_id,score) 153 | print "Syslog score = %s (dump_id=%s)" % (score, dump_id) 154 | Process(target=db_syslog, args=(dump_id,score)).start() 155 | 156 | sha1_path = os.path.join( 157 | FILES_DIR, "%s.%s" % (sha1,file_extension)) 158 | md5_path = os.path.join( 159 | FILES_DIR, "%s.%s" % (md5,file_extension)) 160 | shutil.move(file_path, sha1_path) 161 | print "sha1_path", sha1_path 162 | print "md5_path", md5_path 163 | if not os.path.exists(md5_path): 164 | os.symlink("%s.%s" % (sha1,file_extension), md5_path) 165 | print "Done processing file: %s" % (raw_path,) 166 | 167 | 168 | def start_amico(): 169 | Process(target=vt_submissions_func).start() 170 | print "Started amico_scripts" 171 | while True: 172 | p = subprocess.Popen( 173 | 'ls -atr %s |egrep "\:[0-9]+\-[0-9]+$" | egrep -v "\.tmp$"' % 174 | (DUMP_DIR,), 175 | stdout=subprocess.PIPE, shell=True) 176 | output = p.communicate()[0] 177 | file_names = [i.strip() for i in output.split('\n') if i.strip() != ''] 178 | for file_name in file_names: 179 | file_path = os.path.join(DUMP_DIR, file_name) 180 | if not is_whitelisted(file_path): 181 | raw_path = os.path.join(RAW_DIR, file_name) 182 | shutil.copy(file_path, RAW_DIR) 183 | try: 184 | process_file(raw_path, file_name) 185 | except Exception as e: 186 | print "Exception in processing file %s" % (raw_path,) 187 | print e 188 | traceback.print_exc() 189 | else: 190 | print "domain in %s is whitelisted. Ignoring..." 
% (file_path,) 191 | os.remove(file_path) 192 | time.sleep(WAIT_TIME) 193 | 194 | if __name__ == "__main__": 195 | start_amico() 196 | -------------------------------------------------------------------------------- /amico_scripts/trainer.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2014 Phani Vadrevu # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | from datetime import timedelta, date, datetime 15 | import psycopg2.extras 16 | import psycopg2.extensions 17 | import subprocess 18 | import sys 19 | import os 20 | 21 | from train_config import training_days, training_start_date 22 | from features import features 23 | import util 24 | 25 | 26 | class Trainer: 27 | def __init__(self,): 28 | self.output_file = "train.arff" 29 | self.conn = util.connect_to_db() 30 | self.conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_READ_COMMITTED) 31 | self.clean_label_delta = timedelta(days=30) 32 | self.training_end_date = date.today() 33 | if training_start_date: 34 | self.training_start_date = datetime.strptime(training_start_date, 35 | "%Y-%m-%d") 36 | else: 37 | cursor = self.conn.cursor() 38 | cursor.execute(""" 39 | SELECT MIN(timestamp) 40 | FROM pe_dumps""") 41 | if cursor.rowcount > 0: 42 | self.training_start_date = cursor.fetchone()[0].date() 43 | else: 44 | print "No entries in the database to train!" 
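# (without any pe_dumps rows there is nothing to label, so bail out
# instead of emitting an empty train.arff)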
45 | sys.exit() 46 | cursor.close() 47 | if training_days: 48 | self.training_end_date = (self.training_start_date + 49 | timedelta(days=training_days)) 50 | print "Training start date:", self.training_start_date.strftime("%B %d, %Y") 51 | print "Training end date:", self.training_end_date.strftime("%B %d, %Y") 52 | 53 | def count(self,): 54 | self.benign_dumps = self.get_benign_dumps() 55 | self.malicious_dumps = self.get_malicious_dumps() 56 | print "# benign dumps", len(self.benign_dumps) 57 | print "# malware dumps", len(self.malicious_dumps) 58 | 59 | def train(self,): 60 | model_name = datetime.today().strftime("%b%d_%y_%H%M%S") 61 | model_output_file = "models/%s.model" % (model_name,) 62 | self.benign_dumps = self.get_benign_dumps() 63 | self.malicious_dumps = self.get_malicious_dumps() 64 | print "# benign dumps", len(self.benign_dumps) 65 | print "# malware dumps", len(self.malicious_dumps) 66 | self.print_arff() 67 | subprocess.call(""" 68 | java -Xmx2000m -cp ./weka.jar weka.classifiers.meta.FilteredClassifier -t train.arff -d %s -p 1,58,59 -distribution -F "weka.filters.unsupervised.attribute.RemoveType -T string" -W weka.classifiers.trees.RandomForest -- -K 0 -S 1 -I 50 > logs/training/%s.log 69 | """ % (model_output_file, model_name), shell=True) 70 | print "New model trained: %s" % (model_output_file,) 71 | print "Log file: logs/training/%s.log" % (model_name,) 72 | os.remove("train.arff") 73 | 74 | def get_arff_line(self, dump_id, is_benign): 75 | self.cursor = self.conn.cursor( 76 | cursor_factory=psycopg2.extras.NamedTupleCursor) 77 | values = [] 78 | self.cursor.execute(""" 79 | SELECT * FROM weka_features 80 | WHERE dump_id = %s""", 81 | (dump_id, )) 82 | if self.cursor.rowcount == 0: 83 | return 84 | res = self.cursor.fetchone() 85 | res = res._asdict() 86 | for feature in features: 87 | values.append(res[feature]) 88 | try: 89 | data_string = ','.join(['?' 
if (value is None or value is '') else 90 | str(value) for value in values]) 91 | except Exception as e: 92 | print "Error in generating the feature vector in ARFF", e 93 | return 94 | if is_benign: 95 | data_string += ",neg" 96 | else: 97 | data_string += ",pos" 98 | self.cursor.close() 99 | return data_string 100 | 101 | def print_arff(self,): 102 | w = open(self.output_file, 'w') 103 | w.write('@RELATION train\n\n') 104 | for feature in features: 105 | if feature in ['sha1', 'dump_id', 'host', 'corrupt', 106 | 'vt_month_shelf', 'url_struct']: 107 | data_type = "STRING" 108 | elif feature == "extension_class": 109 | data_type = ("{common_ext,unknown_ext,common_fake,other_ext," 110 | "no_url,no_ext}") 111 | else: 112 | data_type = "NUMERIC" 113 | w.write('@ATTRIBUTE %s %s\n' % (feature, data_type)) 114 | #print "%s : %s" % (key, res[key]) 115 | 116 | w.write('@ATTRIBUTE class {pos, neg}\n\n') 117 | w.write('@DATA\n\n') 118 | for dump_id in self.benign_dumps: 119 | arff_line = self.get_arff_line(dump_id, True) 120 | if arff_line: 121 | w.write(arff_line + '\n') 122 | for dump_id in self.malicious_dumps: 123 | arff_line = self.get_arff_line(dump_id, False) 124 | if arff_line: 125 | w.write(arff_line + '\n') 126 | w.close() 127 | 128 | def get_benign_dumps(self,): 129 | self.cursor = self.conn.cursor() 130 | self.cursor.execute(""" 131 | SELECT DISTINCT(sha1) 132 | FROM 133 | virus_total_scans as vts JOIN 134 | virus_total_submissions as vt_sub 135 | USING (sha1) 136 | WHERE 137 | vt_sub.scan_time - vts.scan_time > %s 138 | AND vt_sub.num_av_labels = 0 139 | """, (self.clean_label_delta,)) 140 | hashes = set(self.cursor.fetchall()) 141 | self.cursor.execute(""" 142 | SELECT DISTINCT(sha1) 143 | FROM 144 | virus_total_submissions as vts JOIN 145 | virus_total_submissions as vt_sub 146 | USING (sha1) 147 | WHERE 148 | vt_sub.scan_time - vts.scan_time > %s 149 | AND vt_sub.num_av_labels = 0 150 | """, (self.clean_label_delta,)) 151 | hashes.update(self.cursor.fetchall()) 152 | dumps = set() 153 | for sha1 in hashes: 154 | self.cursor.execute(""" 155 | SELECT dump_id 156 | FROM pe_dumps 157 | WHERE timestamp >= %s AND 158 | timestamp <= %s AND 159 | sha1 = %s 160 | """, (self.training_start_date, self.training_end_date, 161 | sha1)) 162 | dumps.update(self.cursor.fetchall()) 163 | self.cursor.close() 164 | return dumps 165 | 166 | def get_malicious_dumps(self,): 167 | self.cursor = self.conn.cursor() 168 | self.cursor.execute(""" 169 | SELECT DISTINCT(sha1) 170 | FROM 171 | virus_total_scans as vts JOIN 172 | virus_total_submissions as vt_sub 173 | USING (sha1) 174 | WHERE 175 | vt_sub.scan_time - vts.scan_time > %s 176 | AND vt_sub.trusted_av_labels >= 2 177 | """, (self.clean_label_delta,)) 178 | hashes = set(self.cursor.fetchall()) 179 | self.cursor.execute(""" 180 | SELECT DISTINCT(sha1) 181 | FROM 182 | virus_total_submissions as vts JOIN 183 | virus_total_submissions as vt_sub 184 | USING (sha1) 185 | WHERE 186 | vt_sub.scan_time - vts.scan_time > %s 187 | AND vt_sub.trusted_av_labels >= 2 188 | """, (self.clean_label_delta,)) 189 | hashes.update(self.cursor.fetchall()) 190 | dumps = set() 191 | for sha1 in hashes: 192 | self.cursor.execute(""" 193 | SELECT dump_id 194 | FROM pe_dumps 195 | WHERE timestamp >= %s AND 196 | timestamp <= %s AND 197 | sha1 = %s 198 | """, (self.training_start_date, self.training_end_date, 199 | sha1)) 200 | dumps.update(self.cursor.fetchall()) 201 | self.cursor.close() 202 | return dumps 203 | 204 | if __name__ == "__main__": 205 | trainer = Trainer() 206 | if 
len(sys.argv) > 1 and sys.argv[1] == "-c": 207 | trainer.count() 208 | else: 209 | trainer.train() 210 | -------------------------------------------------------------------------------- /amico_scripts/vt_submit.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Copyright (C) 2011-2013 Phani Vadrevu and Roberto Perdisci # 3 | # pvadrevu@uga.edu # 4 | # # 5 | # Distributed under the GNU Public License # 6 | # http://www.gnu.org/licenses/gpl.txt # 7 | # # 8 | # This program is free software; you can redistribute it and/or modify # 9 | # it under the terms of the GNU General Public License as published by # 10 | # the Free Software Foundation; either version 2 of the License, or # 11 | # (at your option) any later version. # 12 | # # 13 | ########################################################################### 14 | 15 | import sys 16 | from datetime import timedelta, date 17 | import time 18 | 19 | import simplejson 20 | import logging 21 | import logging.config 22 | 23 | from config import * 24 | import vt_api 25 | import util 26 | 27 | LOG_CONF_FILE = "logging.conf" 28 | 29 | class VTSubmissions: 30 | def __init__(self): 31 | self.QUERY_RATE_LIMIT = 10 32 | self.ONE_MIN = 60 33 | 34 | logging.config.fileConfig(LOG_CONF_FILE) 35 | self.logger = logging.getLogger("amico_logger") 36 | #stdout_handler = logging.StreamHandler(sys.stdout) 37 | #stdout_handler.setLevel(logging.DEBUG) 38 | #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s' 39 | #'- %(message)s') 40 | #stdout_handler.setFormatter(formatter) 41 | #self.logger.addHandler(stdout_handler) 42 | 43 | util.setup_socks() 44 | self.conn = util.connect_to_db() 45 | self.cursor = self.conn.cursor() 46 | 47 | self.today = date.today().strftime("%Y-%m-%d") 48 | self.yesterday = (date.today() - 49 | timedelta(days=1)).strftime("%Y-%m-%d") 50 | self.last_month = (date.today() - 51 | timedelta(days=30)).strftime("%Y-%m-%d") 52 | 53 | def get_hashes_from_db(self): 54 | if vt_submissions == "manual": 55 | hashes = self.get_hashes_from_db_manual() 56 | elif vt_submissions == "live": 57 | hashes = self.get_hashes_from_db_live() 58 | else: 59 | hashes = self.get_hashes_from_db_scans() 60 | 61 | self.logger.debug("get_hashes_from_db(): Yesterday's hahses: %s", len(hashes)) 62 | self.hashes = self.update_hashes(hashes) 63 | 64 | def update_hashes(self, hashes): 65 | self.cursor.execute(""" 66 | SELECT distinct md5, sha1 67 | FROM virus_total_submissions 68 | WHERE (submit_time::date) = %s 69 | """, (self.last_month,)) 70 | if self.cursor.rowcount > 0: 71 | hashes = hashes.union(self.cursor.fetchall()) 72 | self.cursor.execute(""" 73 | SELECT distinct md5, sha1 74 | FROM virus_total_submissions 75 | WHERE (submit_time::date) > %s AND 76 | (submit_time::date) < %s 77 | """, (self.last_month, self.yesterday)) 78 | if self.cursor.rowcount > 0: 79 | hashes = hashes.difference(self.cursor.fetchall()) 80 | self.cursor.execute(""" 81 | SELECT distinct md5, sha1 82 | FROM virus_total_submissions 83 | WHERE (submit_time::date) = %s 84 | """, (self.today,)) 85 | if self.cursor.rowcount > 0: 86 | hashes = hashes.difference(self.cursor.fetchall()) 87 | self.logger.debug("update_hashes(): Number of hashes: %s", len(hashes)) 88 | return hashes 89 | 90 | def get_hashes_from_db_scans(self): 91 | self.cursor.execute(""" 92 | SELECT distinct md5, sha1 93 | FROM virus_total_scans 94 | WHERE json IS NOT NULL AND 95 | query_time::date = 
%s 96 | """, (self.yesterday,)) 97 | if self.cursor.rowcount > 0: 98 | hashes = set(self.cursor.fetchall()) 99 | else: 100 | hashes = set() 101 | return hashes 102 | 103 | def get_hashes_from_db_live(self): 104 | self.cursor.execute(""" 105 | SELECT distinct md5, sha1 106 | FROM pe_dumps 107 | WHERE sha1 IS NOT NULL AND 108 | timestamp::date = %s 109 | """, (self.yesterday,)) 110 | if self.cursor.rowcount > 0: 111 | hashes = set(self.cursor.fetchall()) 112 | else: 113 | hashes = set() 114 | return hashes 115 | 116 | def get_hashes_from_db_manual(self): 117 | self.logger.debug("entered get_hashes_from_db_manual()") 118 | self.cursor.execute(""" 119 | SELECT distinct md5, sha1 120 | FROM manual_download_checksums 121 | WHERE referer_exists = 'f' AND 122 | sha1 IS NOT NULL AND 123 | timestamp::date = %s 124 | """, (self.yesterday,)) 125 | if self.cursor.rowcount > 0: 126 | hashes = set(self.cursor.fetchall()) 127 | else: 128 | hashes = set() 129 | return hashes 130 | 131 | def insert_scan(self, sha1, md5, response): 132 | self.logger.debug("entered insert_scan()") 133 | self.cursor.execute(""" 134 | INSERT INTO virus_total_submissions 135 | (submit_time, sha1, md5, scan_id) 136 | VALUES (LOCALTIMESTAMP, %s, %s, %s) 137 | RETURNING vt_submit_id 138 | """, (sha1, md5, response['scan_id'])) 139 | vt_submit_id = self.cursor.fetchone()[0] 140 | self.cursor.execute(""" 141 | UPDATE virus_total_submissions 142 | SET resubmit_id = %s 143 | WHERE sha1= %s AND 144 | submit_time::date = %s 145 | """, (vt_submit_id, sha1, self.last_month)) 146 | 147 | def check_report_exists(self, sha1): 148 | self.cursor.execute(""" 149 | SELECT * FROM virus_total_scans 150 | WHERE sha1 = %s AND 151 | scan_time IS NOT NULL""", (sha1, )) 152 | report_exists = True if self.cursor.rowcount else False 153 | self.cursor.execute(""" 154 | SELECT * FROM virus_total_submissions 155 | WHERE sha1 = %s AND 156 | json IS NOT NULL""", (sha1, )) 157 | report_exists = True if self.cursor.rowcount else report_exists 158 | return report_exists 159 | 160 | def make_request(self, md5, sha1): 161 | self.logger.debug("entered make_request()") 162 | self.logger.debug("sha1: %s", sha1) 163 | report_exists = self.check_report_exists(sha1) 164 | self.logger.debug("report_exists: %s", report_exists) 165 | json = None 166 | try: 167 | json = (vt_api.rescan_request(md5) if report_exists else 168 | vt_api.send_file(md5)) 169 | if json: 170 | response = simplejson.loads(json) 171 | if response["response_code"] == 1: 172 | self.insert_scan(sha1, md5, response) 173 | return True 174 | else: 175 | self.logger.warning("make_request: Bad response code: %s", 176 | response["response_code"]) 177 | else: 178 | self.logger.warning("make_request: No JSON response") 179 | except Exception as e: 180 | self.logger.exception("report_exists: %s", report_exists) 181 | self.logger.exception("json: %s", json) 182 | self.logger.exception("sha1: %s", sha1) 183 | self.logger.exception("make_request: Error %s", e) 184 | return False 185 | 186 | def submit_hashes(self): 187 | self.logger.debug("entered submit_hashes()") 188 | query_count = 0 189 | done_hashes = set() 190 | for md5, sha1 in self.hashes: 191 | tries = 0 192 | # This loop makes max 3 attempts to send a scan request 193 | while tries <= 3: 194 | if query_count == self.QUERY_RATE_LIMIT: 195 | self.logger.debug( 196 | "Query limit reached. 
Sleeping for a min.") 197 | time.sleep(self.ONE_MIN) 198 | query_count = 0 199 | tries += 1 200 | query_count += 1 201 | if self.make_request(md5, sha1): 202 | done_hashes.add((md5, sha1)) 203 | break 204 | if len(self.hashes): 205 | self.logger.debug("Submitted the hashes on: %s", date.today()) 206 | self.hashes.difference_update(done_hashes) 207 | 208 | def update_table_with_report(self, scan_id, report, json): 209 | self.logger.debug("entered update_table_with_report()") 210 | scan_time = report["scan_date"] 211 | scans = report["scans"] 212 | num_av_labels = report["positives"] 213 | trusted_av_labels = 0 214 | for k, v in scans.iteritems(): 215 | if v["detected"] is True: 216 | if k in trusted_av_vendors: 217 | trusted_av_labels += 1 218 | scan_time += " UTC" 219 | self.cursor.execute(""" 220 | UPDATE virus_total_submissions 221 | SET trusted_av_labels = %s, 222 | num_av_labels = %s, 223 | scan_time = TIMESTAMP WITH TIME ZONE %s, 224 | json = %s 225 | WHERE scan_id = %s and json is NULL""", 226 | (trusted_av_labels, num_av_labels, scan_time, 227 | json, scan_id)) 228 | 229 | def fetch_reports(self): 230 | self.logger.debug("entered fetch_reports()") 231 | self.cursor.execute(""" 232 | SELECT scan_id 233 | FROM virus_total_submissions 234 | WHERE json is NULL and 235 | (LOCALTIMESTAMP - submit_time) > '5 minutes' and 236 | (LOCALTIMESTAMP - submit_time) < '3 days' 237 | ORDER BY submit_time ASC""") 238 | scan_ids = [row[0] for row in self.cursor.fetchall()] 239 | self.logger.debug("fetch_reports(): %s scan reports to be fetched", 240 | len(scan_ids)) 241 | query_count = 0 242 | for scan_id in scan_ids: 243 | if query_count == self.QUERY_RATE_LIMIT: 244 | self.logger.debug( 245 | "Query limit reached. Sleeping for a min.") 246 | time.sleep(self.ONE_MIN) 247 | query_count = 0 248 | query_count += 1 249 | try: 250 | json = vt_api.get_vt_report(scan_id) 251 | if not json: 252 | self.logger.debug("No json") 253 | continue 254 | report = simplejson.loads(json) 255 | # Sometimes, we get the old reports wrongly 256 | if (report["response_code"] != 1) or ( 257 | report['scan_id'] != scan_id): 258 | self.logger.debug("Response code %s for scan_id %s" % 259 | (report["response_code"], scan_id)) 260 | continue 261 | self.update_table_with_report(scan_id, report, json) 262 | except Exception as e: 263 | self.logger.exception( 264 | "Error in fetching report for scan_id %s: %s" % (scan_id, e)) 265 | continue 266 | 267 | 268 | def sleep_for_the_day(): 269 | today = date.today() 270 | while today == date.today(): 271 | time.sleep(15 * 60) 272 | 273 | 274 | def vt_submissions_func(): 275 | vt_submit = VTSubmissions() 276 | vt_submit.get_hashes_from_db() 277 | while True: 278 | try: 279 | vt_submit.submit_hashes() 280 | vt_submit.fetch_reports() 281 | except Exception as e: 282 | vt_submit.logger.exception( 283 | "Unexpected error! 
274 | def vt_submissions_func(): 275 | vt_submit = VTSubmissions() 276 | vt_submit.get_hashes_from_db() 277 | while True: 278 | try: 279 | vt_submit.submit_hashes() 280 | vt_submit.fetch_reports() 281 | except Exception as e: 282 | vt_submit.logger.exception( 283 | "Unexpected error! %s \n Sleeping for the rest of the day", e) 284 | sleep_for_the_day() 285 | 286 | vt_submit.logger.debug("main(): Sleeping for 15 min.") 287 | time.sleep(vt_submit.ONE_MIN * 15) 288 | 289 | today = date.today().strftime("%Y-%m-%d") 290 | if today != vt_submit.today: 291 | vt_submit.today = today 292 | vt_submit.yesterday = (date.today() - 293 | timedelta(days=1)).strftime("%Y-%m-%d") 294 | vt_submit.last_month = (date.today() - 295 | timedelta(days=30)).strftime("%Y-%m-%d") 296 | vt_submit.get_hashes_from_db() 297 | 298 | 299 | if __name__ == "__main__": 300 | vt_submissions_func() 301 | -------------------------------------------------------------------------------- /file_dump/lru-cache.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This is an implementation of a O(1) LRU cache. 3 | * Copyright (C) 2010 Roberto Perdisci (perdisci@cs.uga.edu) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <stdio.h> 20 | #include "lru-cache.h" 21 | 22 | // #define LRUC_DEBUG 23 | #define HT_SIZE_FACTOR 10 24 | #define LRUC_MIN_ENTRIES 10 25 | 26 | /* Initializes the Hash Table */ 27 | hash_table_t* ht_init(u_int length) { 28 | 29 | int i; 30 | 31 | hash_table_t* ht = (hash_table_t*)malloc(sizeof(hash_table_t)); 32 | ht->length = length * HT_SIZE_FACTOR; 33 | ht->vect = (ht_entry_t**)malloc(sizeof(ht_entry_t*) * ht->length); 34 | for(i=0; i < ht->length; i++) 35 | ht->vect[i] = NULL; 36 | 37 | return ht; 38 | 39 | } 40 | 41 | 42 | /* Deallocate memory for Hash Table */ 43 | void ht_destroy(hash_table_t* ht) { 44 | 45 | ht_entry_t *v; 46 | u_int i; 47 | 48 | if(ht == NULL) 49 | return; 50 | 51 | for(i=0; i < ht->length; i++) { 52 | v = ht->vect[i]; 53 | while(v != NULL) { 54 | ht_entry_t *p = v; 55 | v = v->next; 56 | #ifdef LRUC_DEBUG 57 | printf("Destroying ht vect entry!\n"); 58 | fflush(stdout); 59 | #endif 60 | free(p); 61 | } 62 | } 63 | 64 | free(ht->vect); 65 | ht->vect = NULL; 66 | 67 | free(ht); 68 | 69 | } 70 | 71 | 72 | void default_destroy_val_fn(void *v) { 73 | free(v); 74 | } 75 | 76 | 77 | /* Initializes the LRU cache in the special case of char* values */ 78 | lru_cache_t* lruc_init_str(u_int max_entries) { 79 | return lruc_init(max_entries, default_destroy_val_fn); 80 | } 81 | 82 | 83 | /* Initializes the LRU cache */ 84 | lru_cache_t* lruc_init(u_int max_entries, void (*destroy_val_fn)(void*)) { 85 | 86 | lru_cache_t* lruc = (lru_cache_t*)malloc(sizeof(lru_cache_t)); 87 | lruc->ht = ht_init(max_entries); 88 | lruc->top = NULL; 89 | if(destroy_val_fn != NULL) 90 | lruc->destroy_val_fn = destroy_val_fn; 91 | else 92 | lruc->destroy_val_fn = default_destroy_val_fn; 93 | lruc->num_entries = 0; 94 | lruc->max_entries = LRUC_MIN_ENTRIES; // enforce at least this many entries 95 | if(max_entries > LRUC_MIN_ENTRIES) 96 | lruc->max_entries = max_entries; 97 | 98 | return lruc; 99 | } 100 | 101 | 
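/* Hypothetical usage sketch (illustrative only; record_t and record_destroy
 * are not part of this source): lruc_init() with a caller-supplied destructor
 * is meant for values that own further allocations of their own.
 *
 *   typedef struct { char *name; char *path; } record_t;
 *
 *   void record_destroy(void *v) {
 *       record_t *r = (record_t*)v;
 *       free(r->name);
 *       free(r->path);
 *       free(r);
 *   }
 *
 *   lru_cache_t *c = lruc_init(1000, record_destroy);
 *   // ... lruc_insert(c, key, rec); lruc_search(c, key); ...
 *   lruc_destroy(c);  // record_destroy() runs on every cached value
 */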
102 | /* Deallocate memory for LRU cache */ 103 | void lruc_destroy(lru_cache_t *lruc) { 104 | 105 | if(lruc == NULL) 106 | return; 107 | 108 | if(lruc->top == NULL) { // empty cache: still release the hash table and the cache struct 109 | ht_destroy(lruc->ht); free(lruc); return; } 110 | 111 | if(lruc->top->prev == NULL) { // only one entry... (defensive: entries are normally circular) 112 | free(lruc->top->key); 113 | if(lruc->top->value != NULL) 114 | lruc->destroy_val_fn(lruc->top->value); 115 | free(lruc->top); 116 | ht_destroy(lruc->ht); free(lruc); return; 117 | } 118 | 119 | lruc->top->prev->next = NULL; // break the circular list 120 | while(lruc->top != NULL) { 121 | lruc_entry_t *t = lruc->top; 122 | lruc->top = lruc->top->next; 123 | free(t->key); 124 | if(t->value != NULL) { 125 | lruc->destroy_val_fn(t->value); 126 | t->value = NULL; 127 | } 128 | free(t); 129 | } 130 | 131 | ht_destroy(lruc->ht); 132 | lruc->ht = NULL; 133 | 134 | free(lruc); 135 | 136 | } 137 | 138 | 139 | /* Inserts an element into the Hash Table 140 | * 'lruc_e' is a pointer to the (key,value) entry in the LRU cache 141 | * related to the 'key' parameter 142 | */ 143 | void ht_insert(hash_table_t *ht, lruc_entry_t *lruc_e, const char *key) { 144 | 145 | ht_entry_t *v; 146 | 147 | u_int h = hash_fn(key) % ht->length; 148 | ht_entry_t *e = (ht_entry_t*)malloc(sizeof(ht_entry_t)); 149 | e->key = key; 150 | e->le = lruc_e; 151 | e->next = NULL; 152 | 153 | v = ht->vect[h]; 154 | if(v == NULL) { 155 | ht->vect[h] = e; 156 | return; 157 | } 158 | 159 | while(v->next != NULL) 160 | v = v->next; 161 | 162 | v->next = e; 163 | 164 | } 165 | 166 | /* Inserts a (key,value) pair in the LRU cache. 167 | * Notice that value could be NULL, but the key cannot be NULL 168 | */ 169 | 170 | int lruc_insert_str(lru_cache_t *lruc, const char *key, const char* value) { 171 | 172 | int ret = lruc_insert(lruc, key, NULL); 173 | if(ret == 0 && value != NULL) { // attach the value only if a new entry was actually created 174 | lruc_entry_t *e = ht_search(lruc->ht, key); 175 | e->value = (char*)malloc(sizeof(char)*(strlen(value)+1)); 176 | strcpy(e->value, value); 177 | } 178 | 179 | return ret; 180 | 181 | } 182 | 183 | int lruc_insert(lru_cache_t *lruc, const char *key, void* value) { 184 | 185 | if(key == NULL) 186 | return -1; 187 | 188 | if(lruc_search(lruc, key)!=NULL) 189 | return -1; 190 | 191 | lruc->num_entries++; 192 | #ifdef LRUC_DEBUG 193 | printf("Inserting %u\n", lruc->num_entries); 194 | #endif 195 | 196 | lruc_entry_t *e = (lruc_entry_t*)malloc(sizeof(lruc_entry_t)); 197 | e->key = (char*)malloc(sizeof(char)*(strlen(key)+1)); 198 | strcpy(e->key, key); 199 | e->value = value; 200 | e->time = time(NULL); 201 | 202 | /* the cache is implemented as a doubly-linked circular list */ 203 | if(lruc->top == NULL) { 204 | e->next = e; 205 | e->prev = e; 206 | } 207 | else if(lruc->num_entries <= lruc->max_entries) { 208 | e->prev = lruc->top->prev; 209 | e->next = lruc->top; 210 | 211 | lruc->top->prev->next = e; 212 | lruc->top->prev = e; 213 | } 214 | else { 215 | // printf("LRUC is full!\n"); 216 | // fflush(stdout); 217 | 218 | e->next = lruc->top; 219 | e->prev = lruc->top->prev->prev; 220 | lruc->top->prev->prev->next = e; 221 | lruc_entry_t *tmp = lruc->top->prev; 222 | lruc->top->prev = e; 223 | 224 | // evict from the cache 225 | ht_delete(lruc->ht, tmp->key); 226 | free(tmp->key); 227 | if(tmp->value != NULL) 228 | lruc->destroy_val_fn(tmp->value); 229 | free(tmp); 230 | 231 | lruc->num_entries--; 232 | 233 | // printf("Removed LRU element; inserted the new one!\n"); 234 | // fflush(stdout); 235 | } 236 | 237 | lruc->top = e; 238 | 239 | /* Insert e in the Hash Table for fast, O(1) searches */ 240 | ht_insert(lruc->ht, e, e->key); 241 | 242 | #ifdef LRUC_DEBUG 243 | printf("Inserted %u!\n", lruc->num_entries); 244 | print_ht(lruc->ht); 245 | #endif 246 | 247 | return 0; 248 | } 249 | 250 | 
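/* Implementation note: the entry list is circular and doubly linked, and
 * lruc->top always points at the most recently used entry, so lruc->top->prev
 * is always the least recently used one. This is why both eviction
 * (lruc_insert() above, when the cache is full) and expiration (clean_lruc())
 * can work on lruc->top->prev in O(1).
 */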
printf("Inserted!\n", lruc->num_entries); 244 | print_ht(lruc->ht); 245 | #endif 246 | 247 | return 0; 248 | } 249 | 250 | 251 | /* Delete key from Hash Table */ 252 | void ht_delete(hash_table_t *ht, const char *key) { 253 | 254 | ht_entry_t *v; 255 | ht_entry_t *prev; 256 | 257 | u_int h = hash_fn(key) % ht->length; 258 | #ifdef LRUC_DEBUG 259 | printf("key=%s, h=%u\n", key, h); 260 | #endif 261 | 262 | v = ht->vect[h]; 263 | prev = NULL; 264 | if(v != NULL) { 265 | do { 266 | if(strcmp(key, v->key) == 0) { 267 | if(prev != NULL) 268 | prev->next = v->next; 269 | else if(v->next != NULL) 270 | ht->vect[h] = v->next; 271 | else 272 | ht->vect[h] = NULL; 273 | free(v); 274 | return; 275 | } 276 | prev = v; 277 | v = v->next; 278 | } while(v != NULL); 279 | } 280 | 281 | return; 282 | 283 | } 284 | 285 | // Delete an entry from the LRU cache 286 | void lruc_delete(lru_cache_t *lruc, const char *key) { 287 | 288 | lruc_entry_t *e = ht_search(lruc->ht, key); 289 | 290 | if(e!=NULL) { 291 | if(lruc->top == e && lruc->top->next == e) // only one entry! 292 | lruc->top = NULL; 293 | else { 294 | if(lruc->top == e) 295 | lruc->top = e->next; 296 | 297 | e->prev->next = e->next; 298 | e->next->prev = e->prev; 299 | } 300 | 301 | ht_delete(lruc->ht, e->key); 302 | free(e->key); 303 | if(e->value != NULL) 304 | lruc->destroy_val_fn(e->value); 305 | free(e); 306 | 307 | lruc->num_entries--; 308 | } 309 | 310 | } 311 | 312 | 313 | /* Searches an LRU cache entry using the Hash Table */ 314 | lruc_entry_t* ht_search(const hash_table_t *ht, const char *key) { 315 | 316 | ht_entry_t *v; 317 | 318 | u_int h = hash_fn(key) % ht->length; 319 | v = ht->vect[h]; 320 | 321 | #ifdef LRUC_DEBUG 322 | printf("Hash Key = %u\n", h); 323 | printf("v = %p\n", v); 324 | fflush(stdout); 325 | #endif 326 | 327 | if(v == NULL) 328 | return NULL; 329 | 330 | while(v != NULL) { 331 | #ifdef LRUC_DEBUG 332 | printf("v is not null!\n"); 333 | printf("key = %s\n", key); 334 | printf("v->key = %s\n", v->key); 335 | fflush(stdout); 336 | #endif 337 | 338 | if(strcmp(key, v->key) == 0) 339 | return v->le; 340 | v = v->next; 341 | } 342 | 343 | #ifdef LRUC_DEBUG 344 | printf("HT entry not found! Returing NULL\n"); 345 | fflush(stdout); 346 | #endif 347 | 348 | return NULL; 349 | 350 | } 351 | 352 | 353 | char* lruc_search_str(lru_cache_t *lruc, const char *key) { 354 | return (char*)lruc_search(lruc, key); 355 | } 356 | 357 | 358 | void* lruc_search(lru_cache_t *lruc, const char *key) { 359 | 360 | lruc_entry_t *e = ht_search(lruc->ht, key); 361 | 362 | #ifdef LRUC_DEBUG 363 | printf("e = %p\n", e); 364 | fflush(stdout); 365 | #endif 366 | 367 | if(e == NULL) 368 | return NULL; 369 | 370 | #ifdef LRUC_DEBUG 371 | printf("Found element in Hash Table (%s, %s)\n", e->key, e->value); 372 | fflush(stdout); 373 | #endif 374 | 375 | 376 | if(e != lruc->top) { 377 | /* e is the most recently used: move it to the top, if needed! */ 378 | e->prev->next = e->next; 379 | e->next->prev = e->prev; 380 | e->prev = lruc->top->prev; 381 | e->next = lruc->top; 382 | lruc->top->prev->next = e; 383 | lruc->top->prev = e; 384 | lruc->top = e; 385 | } 386 | 387 | e->time = time(NULL); 388 | 389 | if(e->value != NULL) 390 | return e->value; 391 | 392 | return e->key; // we don't want to return NULL if there is a match! 
393 | // even if the value was NULL 394 | 395 | } 396 | 397 | 398 | u_int hash_fn(const char* key) { 399 | 400 | #define MAX_HASH_ITER 256 401 | return DJBHash(key, strnlen(key, MAX_HASH_ITER)); 402 | 403 | } 404 | 405 | 406 | /* The following hash function has been borrowed 407 | * and slightly modified from 408 | * http://www.partow.net/programming/hashfunctions/ 409 | * Author: Arash Partow 410 | */ 411 | u_int DJBHash(const char* str, u_int len) 412 | { 413 | u_int hash = 5381; 414 | u_int i = 0; 415 | 416 | for(i = 0; i < len; i++) 417 | { 418 | hash = ((hash << 5) + hash) + (str[i]); 419 | } 420 | 421 | return hash; 422 | } 423 | /***/ 424 | 425 | 426 | void print_ht(hash_table_t *ht) { 427 | 428 | ht_entry_t *v; 429 | u_int i; 430 | 431 | if(ht == NULL) 432 | return; 433 | 434 | for(i=0; i < ht->length; i++) { 435 | v = ht->vect[i]; 436 | if(v != NULL) { 437 | #ifdef LRUC_DEBUG 438 | printf("HASH_TAB_ENTRY: %s", v->key); 439 | #endif 440 | while(v->next!=NULL) { 441 | v = v->next; 442 | #ifdef LRUC_DEBUG 443 | printf(" | %s", v->key); 444 | #endif 445 | } 446 | #ifdef LRUC_DEBUG 447 | printf("\n"); 448 | #endif 449 | } 450 | } 451 | 452 | } 453 | 454 | 455 | 456 | void clean_lruc(lru_cache_t *lruc) { 457 | 458 | if(lruc==NULL) 459 | return; 460 | 461 | if(lruc->top == NULL) 462 | return; 463 | 464 | time_t t = time(NULL); 465 | // printf("Current Time = %u\n", t); 466 | 467 | do { 468 | lruc_entry_t *e = lruc->top->prev; 469 | // printf("e Time = %u\n", e->time); 470 | 471 | if((t - e->time) > MAX_LRUC_TTL) { 472 | if(lruc->destroy_val_fn != NULL) { 473 | lruc->destroy_val_fn(e->value); 474 | e->value = NULL; 475 | } 476 | lruc_delete(lruc, e->key); 477 | } 478 | else 479 | break; 480 | 481 | } while(lruc->top!=NULL); 482 | 483 | } 484 | 485 | 486 | void print_lruc(lru_cache_t *lruc) { 487 | 488 | if(lruc==NULL) 489 | return; 490 | 491 | if(lruc->top == NULL) 492 | return; 493 | 494 | lruc_entry_t *e = lruc->top; 495 | 496 | do { 497 | #ifdef LRUC_DEBUG 498 | printf("LRU_ENTRY: (k=%s , v=%s)\n", e->key, e->value); 499 | #endif 500 | e = e->next; 501 | } while(e != lruc->top); 502 | 503 | } 504 | 505 | 506 | /* A little bit of testing to make sure things are working correctly... 
*/ 507 | /** 508 | int main() { 509 | 510 | char k[256]; 511 | char v[256]; 512 | int i; 513 | 514 | printf("Initializing LRU cache...\n"); 515 | lru_cache_t *lruc = lruc_init_str(10); 516 | fflush(stdout); 517 | 518 | 519 | for(i=0; i < 10; i++) { 520 | printf("Inserting (key,val)\n"); 521 | fflush(stdout); 522 | sprintf(k, "key%d", (i+1)); 523 | sprintf(v, "value%d", (i+1)); 524 | lruc_insert_str(lruc, k, v); 525 | print_ht(lruc->ht); 526 | printf("###################\n"); 527 | } 528 | 529 | print_ht(lruc->ht); 530 | print_lruc(lruc); 531 | printf("###################\n"); 532 | 533 | sprintf(k, "key%d", 8); 534 | printf("Searching for k=%s\n", k); 535 | strcpy(v, lruc_search_str(lruc, k)); 536 | printf("Found v=%s\n", v); 537 | printf("###################\n"); 538 | 539 | for(i=10; i < 15; i++) { 540 | printf("Inserting (key,val)\n"); 541 | fflush(stdout); 542 | sprintf(k, "key%d", (i+1)); 543 | sprintf(v, "value%d", (i+1)); 544 | lruc_insert_str(lruc, k, v); 545 | print_ht(lruc->ht); 546 | printf("###################\n"); 547 | 548 | } 549 | 550 | print_ht(lruc->ht); 551 | print_lruc(lruc); 552 | 553 | for(i=6; i < 13; i++) { 554 | sprintf(k, "key%d", i); 555 | printf("Searching for k=%s\n", k); 556 | strcpy(v, lruc_search_str(lruc, k)); 557 | printf("Found v=%s\n", v); 558 | printf("###################\n"); 559 | } 560 | 561 | print_ht(lruc->ht); 562 | print_lruc(lruc); 563 | printf("###################\n"); 564 | 565 | for(i=16; i < 18; i++) { 566 | printf("Inserting (key,val)\n"); 567 | fflush(stdout); 568 | sprintf(k, "key%d", i); 569 | sprintf(v, "value%d", i); 570 | lruc_insert_str(lruc, k, v); 571 | print_ht(lruc->ht); 572 | printf("###################\n"); 573 | 574 | } 575 | 576 | print_ht(lruc->ht); 577 | print_lruc(lruc); 578 | printf("###################\n"); 579 | 580 | sprintf(k, "key%d", 1); 581 | printf("Searching for k=%s\n", k); 582 | if(lruc_search_str(lruc, k)!=NULL) { 583 | strcpy(v, lruc_search_str(lruc, k)); 584 | printf("Found v=%s\n", v); 585 | } 586 | printf("###################\n"); 587 | 588 | lruc_destroy(lruc); 589 | printf("Destroyed!\n"); 590 | 591 | } 592 | **/ 593 | 594 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price.
Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. 
You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. 
If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail.
311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /amico_scripts/get_feature_vector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ########################################################################### 3 | # Copyright (C) 2012 Phani Vadrevu and Roberto Perdisci # 4 | # pvadrevu@uga.edu # 5 | # # 6 | # Distributed under the GNU Public License # 7 | # http://www.gnu.org/licenses/gpl.txt # 8 | # # 9 | # This program is free software; you can redistribute it and/or modify # 10 | # it under the terms of the GNU General Public License as published by # 11 | # the Free Software Foundation; either version 2 of the License, or # 12 | # (at your option) any later version. # 13 | # # 14 | ########################################################################### 15 | import time 16 | import urlparse 17 | import re 18 | import psycopg2 19 | import util 20 | import sys 21 | import numpy as np 22 | import pandas as ps 23 | from datetime import timedelta 24 | from config import MAX_PAST_DUMPS, MAX_PAST_DAYS 25 | 26 | 27 | 28 | 29 | # TODO: Don't let the hash_life_time and num_dumps_with_same_hash values be null 30 | # TODO: Speed up the script 31 | # TODO: Verify how null values of x_malware_ratio features are handled by WEKA 32 | 33 | 34 | 
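# Illustrative sketch (hypothetical values, not part of the original file):
# compute_features_hts() below expects a frame with these columns, built from
# past pe_dumps rows joined with their VirusTotal scans:
#
#   df = ps.DataFrame([(1, 'aa...', 'www.example.com', '203.0.113.7/32', 0, 0),
#                      (2, 'bb...', 'www.example.com', '203.0.113.7/32', 3, 5)],
#                     columns=['dump_id','sha1','host','server','tavs','navs'])
#
# With the thresholds below, this frame yields total_downloads=2,
# benign_downloads=1 (navs == 0), malware_downloads=1 (tavs > 1),
# suspicious_downloads=1 (navs > 0), plus the derived ratios and the
# per-sha1 averages of the most recent navs/tavs values.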
35 | def compute_features_hts(df): 36 | """ Compute host/2ld/server-based features, depending on df """ 37 | 38 | # TODO: Move these thresholds to config file? 39 | FEAT_TAVS_THRESHOLD = 1 40 | FEAT_NAVS_THRESHOLD = 0 41 | 42 | td = len(set(df['dump_id'])) 43 | bd = len(set(df[df.navs==0]['dump_id'])) 44 | md = len(set(df[df.tavs>FEAT_TAVS_THRESHOLD]['dump_id'])) 45 | sd = len(set(df[df.navs>FEAT_NAVS_THRESHOLD]['dump_id'])) 46 | th = len(set(df['sha1'])) 47 | uh = len(set(df[df.navs.isnull()]['sha1'])) 48 | 49 | br = mr = sr = ur = None 50 | if td > 0: 51 | br = float(bd)/td 52 | mr = float(md)/td 53 | sr = float(sd)/td 54 | if th > 0: 55 | ur = float(uh)/th 56 | 57 | avg_navs = None 58 | avg_tavs = None 59 | 60 | sha1_nav_labels = [] 61 | sha1_tav_labels = [] 62 | for (sha,dfgroup) in df.groupby('sha1'): 63 | # sort to get only the most recent number of av labels for a given sha1 64 | d = dfgroup.sort_values('dump_id',ascending=False) 65 | if d['navs'].iat[0] is not None and not np.isnan(d['navs'].iat[0]): 66 | sha1_nav_labels.append(d['navs'].iat[0]) 67 | sha1_tav_labels.append(d['tavs'].iat[0]) 68 | if len(sha1_nav_labels) > 0: 69 | # average over different sha1s 70 | avg_navs = np.mean(sha1_nav_labels) 71 | avg_tavs = np.mean(sha1_tav_labels) 72 | 73 | return { 'total_downloads':td,'benign_downloads':bd,'malware_downloads':md,'suspicious_downloads':sd, 74 | 'total_hashes':th,'unknown_hashes':uh, 75 | 'benign_ratio':br,'malware_ratio':mr,'suspicious_ratio':sr,'unknown_hash_ratio':ur, 76 | 'avg_av_labels':avg_navs,'avg_trusted_labels':avg_tavs } 77 | 78 | 79 | def insert_hts_based_features(cursor, dump_id): 80 | """ Computes host/2ld/server-based features for a given download 81 | 82 | Arguments: 83 | cursor: DB cursor from an existing DB connection 84 | dump_id: id of download to be classified 85 | 86 | """ 87 | 88 | # also query for the timestamp, so we can use it to limit how far we go back in time! 89 | query = " SELECT host,server,DATE(timestamp) FROM pe_dumps WHERE dump_id = %s " 90 | 91 | cursor.execute(query,(dump_id, )) 92 | row = cursor.fetchone() 93 | if not row: 94 | return 95 | 96 | (host,server,date) = row 97 | domain = util.reorder_domain(host) 98 | twold = util.reorder_domain(util.extract_twold(domain)) 99 | twold_like = '-NONE-' # avoids any matching in "pe.host LIKE %s" in the query below 100 | if twold is None: 101 | if host is not None: 102 | twold = host 103 | twold_like = twold + '.%' 104 | 105 | query = """ 106 | SELECT dump_id,pe.sha1,pe.host,pe.server,trusted_av_labels,num_av_labels 107 | FROM pe_dumps AS pe 108 | JOIN ped_vts_mapping AS pvm 109 | USING(dump_id) 110 | JOIN virus_total_scans AS vts 111 | USING(vt_id) 112 | WHERE pe.corrupt = 'f' AND 113 | (pe.host = %s OR pe.host LIKE %s OR pe.server = %s) AND 114 | pe.dump_id < %s AND pe.dump_id > %s AND 115 | pe.timestamp > %s """ 116 | 117 | cursor.execute(query,(host, twold_like, server, dump_id, dump_id-MAX_PAST_DUMPS, date-timedelta(days=MAX_PAST_DAYS))) 118 | tuples = cursor.fetchall() 119 | 120 | # make the results into a pandas data frame 121 | if not tuples: 122 | df = ps.DataFrame(index=[], columns=['dump_id','sha1','host','server','tavs','navs']) 123 | else: 124 | df = ps.DataFrame.from_records(tuples) 125 | df.columns = ['dump_id','sha1','host','server','tavs','navs'] 126 | 127 | ### compute twold-based features 128 | df_twold = df[df['host'].str.startswith(twold)==True] 129 | twold_v = compute_features_hts(df_twold) 130 | 131 | ### compute host-based features 132 | df_host = df[df.host == host] 133 | host_v = compute_features_hts(df_host) 134 | 135 | ### compute server-based features 136 | df_server = df[df.server == server] 137 | server_v = 
compute_features_hts(df_server) 138 | 139 | twold_features = (twold_v['benign_downloads'],twold_v['malware_downloads'], twold_v['suspicious_downloads'], 140 | twold_v['total_downloads'], twold_v['malware_ratio'], twold_v['suspicious_ratio'], twold_v['benign_ratio'], 141 | twold_v['avg_av_labels'], twold_v['avg_trusted_labels'], 142 | twold_v['unknown_hashes'], twold_v['total_hashes'], twold_v['unknown_hash_ratio']) 143 | 144 | host_features = (host_v['benign_downloads'],host_v['malware_downloads'], host_v['suspicious_downloads'], 145 | host_v['total_downloads'], host_v['malware_ratio'], host_v['suspicious_ratio'], host_v['benign_ratio'], 146 | host_v['avg_av_labels'], host_v['avg_trusted_labels'], 147 | host_v['unknown_hashes'], host_v['total_hashes'], host_v['unknown_hash_ratio']) 148 | 149 | server_features = (server_v['benign_downloads'],server_v['malware_downloads'], server_v['suspicious_downloads'], 150 | server_v['total_downloads'], server_v['malware_ratio'], server_v['suspicious_ratio'], server_v['benign_ratio'], 151 | server_v['avg_av_labels'], server_v['avg_trusted_labels'], 152 | server_v['unknown_hashes'], server_v['total_hashes'], server_v['unknown_hash_ratio']) 153 | 154 | 155 | query = """ UPDATE weka_features SET 156 | 157 | twold_benign_downloads = %s, 158 | twold_malware_downloads = %s, 159 | twold_suspicious_downloads = %s, 160 | twold_total_downloads = %s, 161 | twold_malware_ratio = %s, 162 | twold_suspicious_ratio = %s, 163 | twold_benign_ratio = %s, 164 | twold_avg_av_labels = %s, 165 | twold_avg_trusted_labels = %s, 166 | twold_unknown_hashes = %s, 167 | twold_total_hashes = %s, 168 | twold_unknown_hash_ratio = %s, 169 | 170 | host_benign_downloads = %s, 171 | host_malware_downloads = %s, 172 | host_suspicious_downloads = %s, 173 | host_total_downloads = %s, 174 | host_malware_ratio = %s, 175 | host_suspicious_ratio = %s, 176 | host_benign_ratio = %s, 177 | host_avg_av_labels = %s, 178 | host_avg_trusted_labels = %s, 179 | host_unknown_hashes = %s, 180 | host_total_hashes = %s, 181 | host_unknown_hash_ratio = %s, 182 | 183 | server_ip_benign_downloads = %s, 184 | server_ip_malware_downloads = %s, 185 | server_ip_suspicious_downloads = %s, 186 | server_ip_total_downloads = %s, 187 | server_ip_malware_ratio = %s, 188 | server_ip_suspicious_ratio = %s, 189 | server_ip_benign_ratio = %s, 190 | server_ip_avg_av_labels = %s, 191 | server_ip_avg_trusted_labels = %s, 192 | server_ip_unknown_hashes = %s, 193 | server_ip_total_hashes = %s, 194 | server_ip_unknown_hash_ratio = %s 195 | 196 | where dump_id = %s """ 197 | 198 | try: 199 | cursor.execute(query, twold_features + host_features + server_features + (dump_id,)) 200 | except Exception as e: 201 | print e 202 | print "Could not insert server-based features for the dump #", dump_id 203 | 204 | 205 | 206 | def insert_bgp_based_features(cursor, dump_id): 207 | 208 | cursor.execute(""" 209 | SELECT server from pe_dumps where dump_id = %s""", (dump_id, )) 210 | server = cursor.fetchone()[0] 211 | 212 | cursor.execute(""" 213 | select bgp_prefix from bgp2asn where bgp_prefix >> %s""", (server,)) 214 | row = cursor.fetchone() 215 | if row is not None: 216 | bgp_prefix = row[0] 217 | else: 218 | return 219 | 220 | cursor.execute(""" 221 | SELECT COUNT(DISTINCT dump_id) 222 | FROM pe_dumps AS pe 223 | WHERE pe.server << %s AND 224 | pe.corrupt = 'f' AND 225 | pe.dump_id < %s AND pe.dump_id > %s """, 226 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 227 | bgp_total_downloads = cursor.fetchone()[0] 228 | 229 | # Disabled 
vt_month_shelf due to the 403 error from VT 230 | #cursor.execute(""" 231 | # SELECT count(distinct dump_id) from pe_dumps as pe JOIN 232 | # weka_features as f using (dump_id) 233 | # where f.raw_dump_num_av_labels = 0 and f.vt_month_shelf = 't' and 234 | # pe.server << %s and pe.dump_id < %s """, 235 | # (bgp_prefix, dump_id)) 236 | cursor.execute(""" 237 | SELECT COUNT(DISTINCT dump_id) 238 | FROM pe_dumps AS pe JOIN 239 | ped_vts_mapping AS pvm USING (dump_id), 240 | virus_total_scans AS vts 241 | WHERE vts.num_av_labels = 0 AND 242 | pe.corrupt = 'f' AND 243 | pe.server << %s AND 244 | pe.dump_id < %s AND pe.dump_id > %s AND 245 | vts.vt_id = pvm.vt_id""", 246 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 247 | bgp_benign_downloads = cursor.fetchone()[0] 248 | 249 | cursor.execute(""" 250 | SELECT COUNT(DISTINCT dump_id) 251 | FROM pe_dumps AS pe JOIN 252 | ped_vts_mapping AS pvm USING (dump_id), 253 | virus_total_scans AS vts 254 | WHERE vts.trusted_av_labels > 1 AND 255 | pe.corrupt = 'f' AND 256 | pe.server << %s AND 257 | pe.dump_id < %s AND pe.dump_id > %s AND 258 | vts.vt_id = pvm.vt_id""", 259 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 260 | bgp_malware_downloads = cursor.fetchone()[0] 261 | 262 | cursor.execute(""" 263 | SELECT COUNT(DISTINCT dump_id) 264 | FROM pe_dumps AS pe JOIN 265 | ped_vts_mapping AS pvm USING (dump_id), 266 | virus_total_scans AS vts 267 | WHERE vts.num_av_labels > 1 AND 268 | pe.corrupt = 'f' AND 269 | pe.server << %s AND 270 | pe.dump_id < %s AND pe.dump_id > %s AND 271 | vts.vt_id = pvm.vt_id""", 272 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 273 | bgp_suspicious_downloads = cursor.fetchone()[0] 274 | 275 | if bgp_total_downloads == 0: 276 | bgp_benign_ratio = None 277 | bgp_malware_ratio = None 278 | bgp_suspicious_ratio = None 279 | else: 280 | bgp_benign_ratio = float(bgp_benign_downloads) / bgp_total_downloads 281 | bgp_malware_ratio = float(bgp_malware_downloads) / bgp_total_downloads 282 | bgp_suspicious_ratio = float(bgp_suspicious_downloads) / bgp_total_downloads 283 | 284 | # The averages are over distinct sha1s 285 | cursor.execute(""" 286 | SELECT AVG(num_av_labels), AVG(trusted_av_labels) 287 | FROM 288 | (SELECT pe.sha1, MAX(dump_id) AS max_id 289 | FROM pe_dumps AS pe 290 | WHERE pe.server << %s AND 291 | pe.dump_id < %s AND pe.dump_id > %s AND 292 | pe.corrupt = 'f' GROUP BY pe.sha1) as a 293 | JOIN 294 | (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id 295 | FROM pe_dumps AS p JOIN 296 | ped_vts_mapping as pvm USING (dump_id), 297 | virus_total_scans as vts 298 | WHERE pvm.vt_id = vts.vt_id AND 299 | p.server << %s AND 300 | dump_id < %s AND dump_id > %s AND 301 | p.corrupt='f') as b 302 | ON a.max_id = b.dump_id 303 | WHERE num_av_labels IS NOT NULL""", 304 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS, bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 305 | if cursor.rowcount == 0: 306 | bgp_avg_av_labels = None 307 | bgp_avg_trusted_labels = None 308 | else: 309 | bgp_avg_av_labels, bgp_avg_trusted_labels = cursor.fetchone() 310 | 311 | # the oldest scan report is used to get the # of unknown hashes 312 | # to remove any bias due to VT submissions 313 | cursor.execute(""" 314 | SELECT COUNT(DISTINCT b.sha1) 315 | FROM 316 | (SELECT pe.sha1, MIN(dump_id) AS min_id 317 | FROM pe_dumps AS pe 318 | WHERE pe.server << %s AND 319 | pe.dump_id < %s AND pe.dump_id > %s AND 320 | pe.corrupt = 'f' GROUP BY pe.sha1) as a 321 | JOIN 322 | (SELECT p.sha1, num_av_labels, trusted_av_labels, dump_id 323 | FROM pe_dumps AS p 
JOIN 324 | ped_vts_mapping as pvm USING (dump_id), 325 | virus_total_scans as vts 326 | WHERE pvm.vt_id = vts.vt_id AND 327 | p.server << %s AND 328 | dump_id < %s AND dump_id > %s AND 329 | p.corrupt='f') as b 330 | ON a.min_id = b.dump_id 331 | WHERE num_av_labels IS NULL""", 332 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS, bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 333 | bgp_unknown_hashes = cursor.fetchone()[0] 334 | 335 | cursor.execute(""" 336 | SELECT COUNT(DISTINCT pe.sha1) 337 | FROM pe_dumps AS pe 338 | WHERE pe.server << %s AND 339 | pe.corrupt = 'f' AND 340 | pe.dump_id < %s AND pe.dump_id > %s """, 341 | (bgp_prefix, dump_id, dump_id-MAX_PAST_DUMPS)) 342 | bgp_total_hashes = cursor.fetchone()[0] 343 | if bgp_total_hashes != 0: 344 | bgp_unknown_hash_ratio = float(bgp_unknown_hashes) / bgp_total_hashes 345 | else: 346 | bgp_unknown_hash_ratio = None 347 | try: 348 | cursor.execute(""" 349 | UPDATE weka_features set bgp_benign_downloads = %s, 350 | bgp_malware_downloads = %s, 351 | bgp_suspicious_downloads = %s, 352 | bgp_total_downloads = %s, 353 | bgp_malware_ratio = %s, 354 | bgp_suspicious_ratio = %s, 355 | bgp_benign_ratio = %s, 356 | bgp_avg_av_labels = %s, 357 | bgp_avg_trusted_labels = %s, 358 | bgp_unknown_hashes = %s, 359 | bgp_total_hashes = %s, 360 | bgp_unknown_hash_ratio = %s 361 | where dump_id = %s """, 362 | (bgp_benign_downloads, bgp_malware_downloads, 363 | bgp_suspicious_downloads, 364 | bgp_total_downloads, bgp_malware_ratio, 365 | bgp_suspicious_ratio, 366 | bgp_benign_ratio, 367 | bgp_avg_av_labels, bgp_avg_trusted_labels, 368 | bgp_unknown_hashes, bgp_total_hashes, 369 | bgp_unknown_hash_ratio, dump_id)) 370 | except Exception as e: 371 | print "Could not insert bgp based features for the dump #", dump_id, "-", e 372 | 373 | 374 | def insert_hash_based_features(cursor, dump_id): 375 | cursor.execute("""select sha1 from pe_dumps where dump_id = %s""", 376 | (dump_id, )) 377 | sha1 = cursor.fetchone()[0] 378 | if sha1 is None: 379 | return 380 | cursor.execute(""" 381 | SELECT EXTRACT(EPOCH FROM (MAX(timestamp) - MIN(timestamp))), 382 | COUNT(DISTINCT pe.dump_id) 383 | FROM pe_dumps AS pe 384 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 385 | pe.corrupt = 'f' AND 386 | pe.sha1 = %s 387 | """, 388 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 389 | hash_life_time, num_dumps_with_same_hash = cursor.fetchone() 390 | 391 | if hash_life_time is None: 392 | hash_life_time = 0 393 | if num_dumps_with_same_hash is None: 394 | num_dumps_with_same_hash = 0 395 | 396 | cursor.execute(""" 397 | UPDATE weka_features 398 | SET hash_life_time = %s, 399 | num_dumps_with_same_hash = %s 400 | WHERE dump_id = %s""", 401 | (hash_life_time, num_dumps_with_same_hash, dump_id)) 402 | 403 | cursor.execute(""" 404 | SELECT count(*) FROM 405 | (SELECT DISTINCT client, 406 | DATE_TRUNC('DAY', timestamp) 407 | FROM pe_dumps AS pe 408 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 409 | pe.corrupt='f' AND 410 | pe.sha1 = %s) AS a""", 411 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 412 | estimated_clients_with_same_hash = cursor.fetchone()[0] 413 | 414 | cursor.execute(""" 415 | SELECT AVG(count) 416 | FROM 417 | (SELECT client, 418 | date_trunc('day', timestamp) AS ts, 419 | COUNT(*) 420 | FROM pe_dumps AS pe 421 | WHERE pe.dump_id < %s AND pe.dump_id > %s AND 422 | pe.corrupt='f' AND 423 | pe.sha1 = %s 424 | GROUP BY client, ts) AS b""", 425 | (dump_id, dump_id-MAX_PAST_DUMPS, sha1)) 426 | hash_daily_dump_rate_per_client = cursor.fetchone()[0] 427 | 428 | cursor.execute(""" 429 | 
UPDATE weka_features 430 | SET estimated_clients_with_same_hash = %s, 431 | hash_daily_dump_rate_per_client = %s 432 | WHERE dump_id = %s""", 433 | (estimated_clients_with_same_hash, hash_daily_dump_rate_per_client, 434 | dump_id)) 435 | 436 | 437 | def insert_download_request_features(cursor, dump_id): 438 | cursor.execute(""" 439 | SELECT * 440 | FROM pe_dumps 441 | WHERE dump_id = %s AND 442 | referer IS null""", 443 | (dump_id,)) 444 | if cursor.rowcount == 1: 445 | referer_exists = 0 446 | else: 447 | referer_exists = 1 448 | 449 | # update weka_features as wf set host_name_exists=0 from pe_dumps as pe 450 | # where pe.dump_id = wf.dump_id and host SIMILAR TO 451 | # '[0-9]+.[0-9]+.[0-9]+.[0-9]+' 452 | cursor.execute(""" 453 | SELECT * 454 | FROM pe_dumps 455 | WHERE dump_id = %s AND 456 | host = SUBSTRING(CAST(server AS TEXT) FROM '(.*)/32')""", 457 | (dump_id,)) 458 | if cursor.rowcount == 0: 459 | host_name_exists = 1 460 | else: 461 | host_name_exists = 0 462 | 463 | cursor.execute(""" 464 | UPDATE weka_features 465 | SET referer_exists = %s, 466 | host_name_exists = %s 467 | WHERE dump_id = %s""", 468 | (referer_exists, host_name_exists, dump_id)) 469 | 470 | # Once we generalize to file types beyond PE files, the extension_class feature should probably be removed 471 | common_ext = ['exe', 'dll', 'msi', 'jar', 'dmg', 'apk'] # executable files extensions... 472 | common_fake = ['html', 'gif', 'jpg', 'jpeg', 'txt', 'pdf', 'htm'] 473 | other_ext = ['php', 'aspx', 'asp'] 474 | 475 | cursor.execute(""" 476 | SELECT url 477 | FROM pe_dumps 478 | WHERE dump_id = %s""", 479 | (dump_id,)) 480 | url = cursor.fetchone()[0] 481 | if url is not None: 482 | ext = util.extract_extension(url) 483 | if ext is not None: 484 | ext = ext[:10] 485 | 486 | if ext is None: 487 | ext_class = "no_ext" 488 | elif ext in common_ext: 489 | ext_class = "common_ext" 490 | elif ext in common_fake: 491 | ext_class = "common_fake" 492 | elif ext in other_ext: 493 | ext_class = "other_ext" 494 | else: 495 | ext_class = "unknown_ext" 496 | #print "url:", url 497 | #print "extension:", ext 498 | else: 499 | ext_class = "no_url" 500 | ext = None 501 | cursor.execute(""" 502 | UPDATE weka_features 503 | SET extension_class = %s 504 | WHERE dump_id = %s""", 505 | (ext_class, dump_id)) 506 | 507 | cursor.execute(""" 508 | SELECT CHAR_LENGTH(url), url 509 | FROM pe_dumps 510 | WHERE dump_id = %s""", 511 | (dump_id,)) 512 | row = cursor.fetchone() 513 | url_length = None 514 | if row is not None: 515 | url_length = row[0] 516 | url = row[1] 517 | if url is not None: 518 | url_path = url.split('?')[0] 519 | directory_depth = url_path.count('/') 520 | else: 521 | url_length = 0 522 | directory_depth = 0 523 | 524 | cursor.execute(""" 525 | UPDATE weka_features SET 526 | url_length = %s, 527 | directory_depth = %s 528 | WHERE dump_id = %s""", 529 | (url_length, directory_depth, dump_id)) 530 | 531 | 532 | def insert_url_features(cursor, dump_id): 533 | # cursor.execute("SELECT ") 534 | cursor.execute("SELECT url from pe_dumps where dump_id = %s", (dump_id,)) 535 | url = cursor.fetchone()[0] 536 | cursor.execute(""" 537 | SELECT COUNT(DISTINCT dump_id) 538 | FROM pe_dumps AS pe JOIN 539 | ped_vts_mapping AS pvm USING (dump_id), 540 | virus_total_scans AS vts 541 | WHERE vts.trusted_av_labels > 1 AND 542 | pe.corrupt = 'f' AND 543 | pe.url = %s AND 544 | pe.dump_id < %s AND pe.dump_id > %s AND 545 | pvm.vt_id = vts.vt_id """, 546 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 547 | url_malware_downloads = cursor.fetchone()[0] 548 
| 549 | cursor.execute(""" 550 | SELECT COUNT(DISTINCT dump_id) 551 | FROM pe_dumps AS pe 552 | WHERE pe.url = %s AND 553 | pe.corrupt = 'f' AND 554 | pe.dump_id < %s AND pe.dump_id > %s """, 555 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 556 | url_total_downloads = cursor.fetchone()[0] 557 | 558 | cursor.execute(""" 559 | SELECT COUNT(DISTINCT pe.sha1) 560 | FROM pe_dumps AS pe 561 | WHERE pe.url = %s AND 562 | pe.corrupt = 'f' AND 563 | pe.dump_id < %s AND pe.dump_id > %s 564 | """, 565 | (url, dump_id, dump_id-MAX_PAST_DUMPS)) 566 | url_distinct_sha1s = cursor.fetchone()[0] 567 | 568 | cursor.execute(""" 569 | UPDATE weka_features 570 | SET url_malware_downloads = %s, 571 | url_total_downloads = %s, 572 | url_distinct_sha1s = %s 573 | WHERE dump_id = %s """, 574 | (url_malware_downloads, url_total_downloads, 575 | url_distinct_sha1s, dump_id)) 576 | 577 | 578 | def get_url_struct_matches(cursor, url_struct, dump_id): 579 | # escaping special regex characters 580 | replace = [ 581 | ('.', '\.'), ('+', '\+'), ('?', '\?'), 582 | ('{', '\{'), ('}', '\}'), ('[', '\['), 583 | (']', '\]'), ('^', '\^'), ('$', '\$') 584 | ] 585 | for pair in replace: 586 | url_struct = url_struct.replace(pair[0], pair[1]) 587 | # the structure should be matched against the whole query path 588 | url_struct = '^.*\?' + url_struct + '$' 589 | #print "The formatted url_struct: %s" % (url_struct,) 590 | cursor.execute(""" 591 | SELECT COUNT(DISTINCT dump_id) 592 | FROM pe_dumps AS pe JOIN 593 | ped_vts_mapping AS pvm USING (dump_id), 594 | virus_total_scans AS vts 595 | WHERE vts.trusted_av_labels > 1 AND 596 | pvm.vt_id = vts.vt_id AND 597 | pe.corrupt = 'f' AND 598 | pe.url ~ %s AND 599 | pe.dump_id < %s AND pe.dump_id > %s """, 600 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 601 | url_struct_malware_downloads = cursor.fetchone()[0] 602 | 603 | cursor.execute(""" 604 | SELECT COUNT(DISTINCT dump_id) 605 | FROM pe_dumps AS pe 606 | WHERE pe.url ~ %s AND 607 | pe.corrupt = 'f' AND 608 | pe.dump_id < %s AND pe.dump_id > %s """, 609 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 610 | url_struct_total_downloads = cursor.fetchone()[0] 611 | 612 | cursor.execute(""" 613 | SELECT COUNT(DISTINCT pe.sha1) 614 | FROM pe_dumps AS pe 615 | WHERE pe.url ~ %s AND 616 | pe.dump_id < %s AND pe.dump_id > %s AND 617 | pe.corrupt='f' """, 618 | (url_struct, dump_id, dump_id-MAX_PAST_DUMPS)) 619 | url_struct_distinct_sha1s = cursor.fetchone()[0] 620 | return (url_struct_malware_downloads, url_struct_total_downloads, 621 | url_struct_distinct_sha1s) 622 | 623 | 624 | def insert_url_struct_matches(cursor, pmd, ptd, pds, dump_id): 625 | # use query parameters rather than string interpolation 626 | sql_query = "UPDATE weka_features " \ 627 | "SET url_struct_malware_downloads = %s, " \ 628 | "url_struct_total_downloads = %s, " \ 629 | "url_struct_distinct_sha1s = %s " \ 630 | "WHERE dump_id = %s" 631 | cursor.execute(sql_query, (pmd, ptd, pds, dump_id)) 632 | 633 | 
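# Worked example (illustrative, not in the original file): for a download URL
# such as "http://a.example.com/dl/get.php?id=123&k=abc",
# insert_url_struct_features() below reduces the query string "id=123&k=abc"
# to the generalized pattern "\w*=\w*&\w*=\w*", which get_url_struct_matches()
# above then anchors as "^.*\?\w*=\w*&\w*=\w*$" and matches (via the SQL "~"
# operator) against past download URLs sharing the same query-string structure.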
"The parsed query is:",query_list 653 | 654 | #print "Query is: %s" % query 655 | m = re.search('([^\w]*)([\w]+)([^\w]+)(.*)', query) 656 | if m is None: 657 | print "No url_struct found!" 658 | return 659 | first_exp = m.group(1) 660 | word = m.group(2) 661 | divide = m.group(3) 662 | rest = m.group(4) 663 | url_struct = None 664 | if first_exp is not None: 665 | url_struct = first_exp 666 | if rest is not None: 667 | url_struct += "\w*" + divide 668 | while True: 669 | m = re.search('([\w]+)([^\w]+)?(.*)', rest) 670 | if m is not None: 671 | word = m.group(1) 672 | divide = m.group(2) 673 | #if '.' in divide: 674 | #print "divide:", divide 675 | rest = m.group(3) 676 | if divide: 677 | url_struct += "\w*" + divide 678 | else: 679 | url_struct += "\w*" 680 | else: break 681 | 682 | #print "url_struct :", url_struct 683 | if len(url_struct) < 10: 684 | print "url_struct pattern length too short:%s, " % len(url_struct), url_struct 685 | return 686 | 687 | pmd, ptd, pds = get_url_struct_matches(cursor, url_struct, dump_id) 688 | print "Number of url_struct matching dumps: %s/%s" % (pmd,ptd) 689 | insert_url_struct_matches(cursor, pmd, ptd, pds, dump_id) 690 | 691 | 692 | # TODO: currently file_type is not used. 693 | # We will need to see if some of the features should be modified 694 | # to take the file_type into account 695 | def insert_features(cursor, dump_id, file_type): 696 | print "the dump_id is:", dump_id 697 | cursor.execute(""" 698 | DELETE FROM weka_features 699 | WHERE dump_id = %s 700 | """, (dump_id,)) 701 | cursor.execute(""" 702 | INSERT INTO weka_features (dump_id, corrupt, sha1, host) 703 | (SELECT pe.dump_id, pe.corrupt, pe.sha1, pe.host 704 | FROM pe_dumps AS pe 705 | WHERE pe.dump_id = %s )""", 706 | (dump_id,)) 707 | #print "Inserted dump_id", cursor.fetchone()[0] 708 | 709 | insert_hts_based_features(cursor, dump_id) 710 | 711 | insert_bgp_based_features(cursor, dump_id) 712 | insert_hash_based_features(cursor, dump_id) 713 | insert_download_request_features(cursor, dump_id) 714 | insert_url_features(cursor, dump_id) 715 | try: 716 | insert_url_struct_features(cursor, dump_id) 717 | except psycopg2.DataError as e: 718 | print "Exception in inserting url_struct features for %s dump_id" % (dump_id,) 719 | print e 720 | 721 | 722 | def get_feature_vector(dump_id, file_type): 723 | #print "entered get_feature_vector" 724 | conn = util.connect_to_db() 725 | cursor = conn.cursor() 726 | insert_features(cursor, dump_id, file_type) 727 | print "Done inserting features for dump_id: ", dump_id 728 | 729 | if __name__ == "__main__": 730 | if len(sys.argv) == 3: 731 | get_feature_vector(int(sys.argv[1]),sys.argv[2]) 732 | else: 733 | print "Incorrect number of arguments!!" 734 | --------------------------------------------------------------------------------