├── .gitignore
├── README.md
└── mwcrawler.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
mwcrawler
=========

mwcrawler is a simple Python script that parses malicious URL lists from
well-known websites (e.g. MDL, Malc0de) in order to automatically download
the malicious code. It can be used to populate malware repositories or zoos.

Currently the script parses the following sources:
- NovCon Minotaur:
  http://minotauranalysis.com/malwarelist-urls.aspx
- Malware Domain List:
  http://www.malwaredomainlist.com/hostslist/mdl.xml
- VX Vault:
  http://vxvault.siri-urz.net/URL_List.php
- Malc0de:
  http://malc0de.com/rss
- Malware Black List:
  http://www.malwareblacklist.com/mbl.xml
- Sacour.cn:
  http://www.sacour.cn

The downloaded content is stored in /opt/malware/unsorted/ by default, so you
need to create this directory first or change the path in the source code.
Sub-folders are created based on the magic number of the downloaded content
(e.g. PE32, PDF, ZIP). For the sake of simplicity, the script splits the file
description string and uses only the first token.

The file name is set to the file's MD5 hash, which is also used to check
whether the file already exists, thus avoiding duplicate entries in the
directories. Note that the original file name (set in the URL or HTTP header)
is ignored.
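
As a rough illustration of this naming scheme, here is a minimal sketch (the
`archive_path` helper is hypothetical and not part of the tool; it is built on
the same `magic` and `hashlib` calls mwcrawler.py itself uses):

    import hashlib
    import magic

    def archive_path(payload):
        # Take the libmagic description and keep only the first token,
        # e.g. 'PE32 executable for MS Windows ...' -> 'PE32'
        ms = magic.open(magic.MAGIC_NONE)
        ms.load()
        filetype = ms.buffer(payload).split(' ')[0]
        # The MD5 hex digest doubles as file name and duplicate check
        md5 = hashlib.md5(payload).hexdigest()
        return '/opt/malware/unsorted/%s/%s' % (filetype, md5)

A PE32 sample would thus end up under /opt/malware/unsorted/PE32/<md5>.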

Additionally, if you have Angelo Dell'Aera's *thug* installed, you can enable
low-interaction analysis of the downloaded HTML code.


Requirements:

- BeautifulSoup 3.0.8 (later versions seem to have problems parsing HTML):
  http://www.crummy.com/software/BeautifulSoup/


Usage:

    $ python mwcrawler.py

Use '-t' to enable thug analysis:

    $ python mwcrawler.py -t


References:

thug repository - http://github.com/buffer/thug
--------------------------------------------------------------------------------
/mwcrawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Copyright (C) 2012 Ricardo Dias
#
# Malware Crawler Module v0.4
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Requirements:
# - BeautifulSoup 3.0.8

from BeautifulSoup import BeautifulSoup as bs
import sys
import hashlib
import re
import urllib2
import magic
import os
import socket
import datetime

# By default thug analysis is disabled
isthug = False

# current date, used to build the Sacour.cn source URL
now = datetime.datetime.now()

# maximum wait time of http gets
timeout = 15
socket.setdefaulttimeout(timeout)

# load the thug module if installed; sets the globals used by decisor()
def loadthug():
    global isthug, thug
    try:
        sys.path.append('/opt/thug/src')
        import thug
        isthug = True
        print "- Thug module loaded for html analysis"
    except ImportError:
        print "- No Thug module found, html code inspection won't be available"

# determine file type for correct archival
def gettype(data):
    ms = magic.open(magic.MAGIC_NONE)
    ms.load()
    return ms.buffer(data)

# fetch a url and return it as a BeautifulSoup object, or None on failure
def parse(url):
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
    try:
        http = bs(urllib2.urlopen(request))
    except:
        print "- Error parsing %s" % (url)
        return None
    return http

# download a candidate url and archive it (or hand it to thug if it is html)
def decisor(url):
    if not re.match('http', url):
        url = 'http://'+url

    try:
        url_dl = urllib2.urlopen(url).read()
    except Exception, e:
        #print "-- Error: %s" % e
        return

    filetype = gettype(url_dl).split(' ')[0]
    md5 = hashlib.md5(url_dl).hexdigest()

    if filetype == 'HTML':
        if isthug:
            print "-- Thug candidate: HTML code in %s" % url

            try:
                thug.Thug([url])()
            except Exception, e:
                print "- Thug error: %s" % e
            return

    else:
        dest = '/opt/malware/unsorted/'+filetype
        fpath = dest+'/'+str(md5)

        if not os.path.exists(dest):
            os.makedirs(dest)

        if not os.path.exists(fpath):
            out = open(fpath, 'wb')
            out.write(url_dl)
            out.close()
            print "-- Saved file type %s with md5: %s" % (filetype, md5)

def malwaredl(soup):
    print "- Fetching from Malware Domain List"
    mdl = []
    for row in soup('description'):
        mdl.append(row)
    del mdl[0]
    mdl_sites = []
    for row in mdl:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        if site == '-':
            mdl_sites.append(re.sub('&amp;', '&', str(row).split()[4]).replace(',', ''))
        else:
            mdl_sites.append(site)
    print "-- Found %s urls" % len(mdl)
    for row in mdl_sites:
        decisor(row)

def vxvault(soup):
    print "- Fetching from VXVault"
    vxv = []
    for row in soup('pre'):
        vxv = row.string.split('\r\n')
    del vxv[:4]
    del vxv[-1]
    print "-- Found %s urls" % len(vxv)
    for row in vxv:
        decisor(row)

def malc0de(soup):
    print "- Fetching from Malc0de"
    mlc = []
    for row in soup('description'):
        mlc.append(row)
    del mlc[0]
    mlc_sites = []
    for row in mlc:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        mlc_sites.append(site)
    print "-- Found %s urls" % len(mlc_sites)
    for row in mlc_sites:
        decisor(row)

def malwarebl(soup):
    print "- Fetching from Malware Black List"
    mbl = []
    for row in soup('description'):
        site = str(row).split()[1].replace(',', '')
        mbl.append(site)
    print "-- Found %s urls" % len(mbl)
    for row in mbl:
        decisor(row)

def minotaur(soup):
    print "- Fetching from NovCon Minotaur"
    urls = []
    for row in soup('td'):
        try:
            if re.match('http', row.string):
                urls.append(row.string)
        except:
            pass
    print "-- Found %s urls" % len(urls)
    for row in urls:
        decisor(row)

def sacour(soup):
    print "- Fetching from Sacour.cn"
    for url in soup('a'):
        urls = []
        if re.match('list/', url['href']):
            suburl = parse('http://www.sacour.cn/'+url['href'])
            if not suburl:
                continue
            for text in suburl('body'):
                for item in text.contents:
                    if re.match('http://', str(item)):
                        urls.append(str(item))
        if len(urls) > 0:
            print "-- Found %s urls in %s" % (len(urls), url['href'])
            for row in urls:
                decisor(row)

if __name__ == "__main__":
    print "Malware Parser v0.4"

    if len(sys.argv) > 1 and sys.argv[1] == '-t':
        loadthug()
    else:
        print "- Thug analysis not enabled (use -t to enable thug)"

    # source list; a source is skipped if its page failed to download/parse
    sources = [
        (minotaur, 'http://minotauranalysis.com/malwarelist-urls.aspx'),
        (malwaredl, 'http://www.malwaredomainlist.com/hostslist/mdl.xml'),
        (vxvault, 'http://vxvault.siri-urz.net/URL_List.php'),
        (malc0de, 'http://malc0de.com/rss'),
        (malwarebl, 'http://www.malwareblacklist.com/mbl.xml'),
        (sacour, 'http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)),
    ]
    for fetch, url in sources:
        soup = parse(url)
        if soup:
            fetch(soup)
--------------------------------------------------------------------------------