├── .gitignore
├── README.md
└── mwcrawler.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
mwcrawler
=========

mwcrawler is a simple Python script that parses malicious URL lists from
well-known websites (e.g. MDL, Malc0de) in order to automatically download
the malicious code. It can be used to populate malware repositories or zoos.

Currently the script parses the following sources:
- NovCon Minotaur:
  http://minotauranalysis.com/malwarelist-urls.aspx
- Malware Domain List:
  http://www.malwaredomainlist.com/hostslist/mdl.xml
- VX Vault:
  http://vxvault.siri-urz.net/URL_List.php
- Malc0de:
  http://malc0de.com/rss
- Malware Black List:
  http://www.malwareblacklist.com/mbl.xml
- Sacour.cn:
  http://www.sacour.cn

The downloaded content is stored in /opt/malware/unsorted/ by default, so you
need to create this directory first or change the path in the source code.
Sub-folders are created based on the magic number of the downloaded content
(e.g. PE32, PDF, ZIP). For the sake of simplicity, the script splits the file
description string and uses only the first token.

The file name is set to the file's MD5 hash, which is also used to check
whether the file already exists, thus avoiding duplicate entries in the
directories. Note that the original file name (set in the URL or HTTP header)
is ignored.
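
As a rough illustration of this naming scheme, here is a minimal sketch (the
`archive_path` helper is hypothetical and not part of the tool; it is built on
the same `magic` and `hashlib` calls mwcrawler.py itself uses):

    import hashlib
    import magic

    def archive_path(payload):
        # Take the libmagic description and keep only the first token,
        # e.g. 'PE32 executable for MS Windows ...' -> 'PE32'
        ms = magic.open(magic.MAGIC_NONE)
        ms.load()
        filetype = ms.buffer(payload).split(' ')[0]
        # The MD5 hex digest doubles as file name and duplicate check
        md5 = hashlib.md5(payload).hexdigest()
        return '/opt/malware/unsorted/%s/%s' % (filetype, md5)

A PE32 sample would thus end up under /opt/malware/unsorted/PE32/<md5>.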

Additionally, if you have Angelo Dell'Aera's *thug* installed, you can enable
low-interaction analysis of the downloaded HTML code.


Requirements:

- BeautifulSoup 3.0.8 (later versions seem to have problems parsing HTML):
  http://www.crummy.com/software/BeautifulSoup/


Usage:

    $ python mwcrawler.py

Use '-t' to enable thug analysis:

    $ python mwcrawler.py -t


References:

thug repository - http://github.com/buffer/thug
--------------------------------------------------------------------------------
/mwcrawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# Copyright (C) 2012 Ricardo Dias
#
# Malware Crawler Module v0.4
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Requirements:
# - BeautifulSoup 3.0.8

from BeautifulSoup import BeautifulSoup as bs
import sys
import hashlib
import re
import urllib2
import magic
import os
import socket
import datetime

# By default thug analysis is disabled
isthug = False

# current date, used to build the Sacour.cn source URL
now = datetime.datetime.now()

# maximum wait time of http gets
timeout = 15
socket.setdefaulttimeout(timeout)

# load the thug module if installed; sets the globals used by decisor()
def loadthug():
    global isthug, thug
    try:
        sys.path.append('/opt/thug/src')
        import thug
        isthug = True
        print "- Thug module loaded for html analysis"
    except ImportError:
        print "- No Thug module found, html code inspection won't be available"

# determine file type for correct archival
def gettype(data):
    ms = magic.open(magic.MAGIC_NONE)
    ms.load()
    return ms.buffer(data)

# fetch a url and return it as a BeautifulSoup object, or None on failure
def parse(url):
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
    try:
        http = bs(urllib2.urlopen(request))
    except:
        print "- Error parsing %s" % (url)
        return None
    return http

# download a candidate url and archive it (or hand it to thug if it is html)
def decisor(url):
    if not re.match('http', url):
        url = 'http://'+url

    try:
        url_dl = urllib2.urlopen(url).read()
    except Exception, e:
        #print "-- Error: %s" % e
        return

    filetype = gettype(url_dl).split(' ')[0]
    md5 = hashlib.md5(url_dl).hexdigest()

    if filetype == 'HTML':
        if isthug:
            print "-- Thug candidate: HTML code in %s" % url

            try:
                thug.Thug([url])()
            except Exception, e:
                print "- Thug error: %s" % e
            return

    else:
        dest = '/opt/malware/unsorted/'+filetype
        fpath = dest+'/'+str(md5)

        if not os.path.exists(dest):
            os.makedirs(dest)

        if not os.path.exists(fpath):
            out = open(fpath, 'wb')
            out.write(url_dl)
            out.close()
            print "-- Saved file type %s with md5: %s" % (filetype, md5)

def malwaredl(soup):
    print "- Fetching from Malware Domain List"
    mdl = []
    for row in soup('description'):
        mdl.append(row)
    del mdl[0]
    mdl_sites = []
    for row in mdl:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        if site == '-':
            mdl_sites.append(re.sub('&amp;', '&', str(row).split()[4]).replace(',', ''))
        else:
            mdl_sites.append(site)
    print "-- Found %s urls" % len(mdl)
    for row in mdl_sites:
        decisor(row)

def vxvault(soup):
    print "- Fetching from VXVault"
    vxv = []
    for row in soup('pre'):
        vxv = row.string.split('\r\n')
    del vxv[:4]
    del vxv[-1]
    print "-- Found %s urls" % len(vxv)
    for row in vxv:
        decisor(row)

def malc0de(soup):
    print "- Fetching from Malc0de"
    mlc = []
    for row in soup('description'):
        mlc.append(row)
    del mlc[0]
    mlc_sites = []
    for row in mlc:
        site = re.sub('&amp;', '&', str(row).split()[1]).replace(',', '')
        mlc_sites.append(site)
    print "-- Found %s urls" % len(mlc_sites)
    for row in mlc_sites:
        decisor(row)

def malwarebl(soup):
    print "- Fetching from Malware Black List"
    mbl = []
    for row in soup('description'):
        site = str(row).split()[1].replace(',', '')
        mbl.append(site)
    print "-- Found %s urls" % len(mbl)
    for row in mbl:
        decisor(row)

def minotaur(soup):
    print "- Fetching from NovCon Minotaur"
    urls = []
    for row in soup('td'):
        try:
            if re.match('http', row.string):
                urls.append(row.string)
        except:
            pass
    print "-- Found %s urls" % len(urls)
    for row in urls:
        decisor(row)

def sacour(soup):
    print "- Fetching from Sacour.cn"
    for url in soup('a'):
        urls = []
        if re.match('list/', url['href']):
            suburl = parse('http://www.sacour.cn/'+url['href'])
            if not suburl:
                continue
            for text in suburl('body'):
                for item in text.contents:
                    if re.match('http://', str(item)):
                        urls.append(str(item))
        if len(urls) > 0:
            print "-- Found %s urls in %s" % (len(urls), url['href'])
            for row in urls:
                decisor(row)

if __name__ == "__main__":
    print "Malware Parser v0.4"

    if len(sys.argv) > 1 and sys.argv[1] == '-t':
        loadthug()
    else:
        print "- Thug analysis not enabled (use -t to enable thug)"

    # source list; a source is skipped if its page failed to download/parse
    sources = [
        (minotaur, 'http://minotauranalysis.com/malwarelist-urls.aspx'),
        (malwaredl, 'http://www.malwaredomainlist.com/hostslist/mdl.xml'),
        (vxvault, 'http://vxvault.siri-urz.net/URL_List.php'),
        (malc0de, 'http://malc0de.com/rss'),
        (malwarebl, 'http://www.malwareblacklist.com/mbl.xml'),
        (sacour, 'http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)),
    ]
    for fetch, url in sources:
        soup = parse(url)
        if soup:
            fetch(soup)
--------------------------------------------------------------------------------