diff --git a/mwcrawler.py b/mwcrawler.py
index 19d0d56..eb73a4d 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -17,185 +17,225 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 # Requirements:
-# - BeautifulSoup 3.0.8
+# - BeautifulSoup 3.0.8 (upgraded to BeautifulSoup 4)
 
-from BeautifulSoup import BeautifulSoup as bs
+from bs4 import BeautifulSoup as bs
 import sys
 import hashlib
 import re
 import urllib2
 import magic
-import os
+import os
 import socket
 import datetime
 
+UNSORTED_FOLDER = '/opt/malware/unsorted'
+HTTP_TIMEOUT = 15
+THUG_PATH = '/opt/thug/src'
+
 # By default thug analyis is disabled
-isthug = False
+isthug = False
 
 # variable for date value manipulation
-now = datetime.datetime.now()
+now = datetime.datetime.now()
 str(now)
 
 # maximum wait time of http gets
-timeout = 15
-socket.setdefaulttimeout(timeout)
+socket.setdefaulttimeout(HTTP_TIMEOUT)
 
 # load thug function, also checks if thug is installed
 def loadthug():
-	try:
-		sys.path.append('/opt/thug/src')
-		import thug
-		isthug = True
-		print "- Thug module loaded for html analysis"
-	except ImportError:
-		print "- No Thug module found, html code inspection won't be available"
+    # declare both as globals: otherwise the import and the flag stay local
+    # to loadthug() and decisor() can never reach the thug module
+    global isthug, thug
+
+    try:
+        sys.path.append(THUG_PATH)
+        import thug
+
+        isthug = True
+        print "- Thug module loaded for html analysis"
+    except ImportError:
+        print "- No Thug module found, html code inspection won't be available"
 
 # determine file type for correct archival
 def gettype(file):
-	ms = magic.open(magic.MAGIC_NONE)
-	ms.load()
-	return ms.buffer(file)
+    ms = magic.open(magic.MAGIC_NONE)
+    ms.load()
+    return ms.buffer(file)
 
 # beautifulsoup parser
 def parse(url):
-	request = urllib2.Request(url)
-	request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
-	try:
-		http = bs(urllib2.urlopen(request))
-	except:
-		print "- Error parsing %s" % (url)
-		return
-	return http
-
-def decisor(url):
-	if not re.match('http',url):
-		url = 'http://'+url
-
-	try:
-		url_dl = urllib2.urlopen(url).read()
-	except Exception, e:
-		#print "-- Error: %s" % e
-		return
+    request = urllib2.Request(url)
+    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
+    try:
+        http = bs(urllib2.urlopen(request))
+    except:
+        print "- Error parsing %s" % (url)
+        return
+    return http
 
-	filetype = gettype(url_dl).split(' ')[0]
-	md5 = hashlib.md5(url_dl).hexdigest()
-	if (filetype == 'HTML'):
-		if isthug:
-			print "-- Thug candidate: HTML code in %s" % url
+def decisor(url):
+    if not re.match('http', url):
+        url = 'http://' + url
+
+    try:
+        url_dl = urllib2.urlopen(url).read()
+    except Exception, e:
+        #print "-- Error: %s" % e
+        return
+
+    filetype = gettype(url_dl).split(' ')[0]
+    md5 = hashlib.md5(url_dl).hexdigest()
+
+    if (filetype == 'HTML'):
+        if isthug:
+            print "-- Thug candidate: HTML code in %s" % url
+
+            try:
+                thug.Thug([url])()
+            except Exception, e:
+                print "- Thug error: %s" % e
+                return
+
+    else:
+        # join with an explicit separator: UNSORTED_FOLDER has no trailing slash
+        dest = UNSORTED_FOLDER + '/' + filetype
+        fpath = dest + '/' + str(md5)
+
+        if not os.path.exists(dest):
+            os.makedirs(dest)
+
+        if not os.path.exists(fpath):
+            file = open(fpath, 'wb')
+            file.write(url_dl)
+            file.close()
+            print "\t\t [*] Saved file type %s with md5: %s" % (filetype, md5)
+
+# Process links from MalwareDomainList
+#
+# Modified for better performance and clarity
+def malwaredl(soup):
+    print "[+] Fetching from Malware Domain List"
+    mdl_sites = []
+    for i in soup.find_all('item'):
+        site_url = (i.description.string).split()[1].replace(',', '')
 
-			try:
-				thug.Thug([url])()
-			except Exception, e:
-				print "- Thug error: %s" % e
-				return
+        if site_url != '-':
+            mdl_sites.append(site_url)
+        else:
+            mdl_sites.append((i.description.string).split()[4].replace(',', ''))
 
-	else:
-		dest = '/opt/malware/unsorted/'+filetype
-		fpath = dest+'/'+str(md5)
+    print "\t [+] Found %s urls" % len(mdl_sites)
+    for row in mdl_sites:
+        decisor(row)
 
-		if not os.path.exists(dest):
-			os.makedirs(dest)
+# Process links from VXVault
+#
+# Modified for better performance and clarity
+def vxvault(soup):
+    print "[+] Fetching from VXVault"
+    vxv = []
+    for row in soup('pre'):
+        vxv = row.string.split('\n')
+    # drop the header lines of the list
+    vxv = vxv[4:]
+    print "\t [+] Found %s urls" % len(vxv)
+    for row in vxv:
+        decisor(row)
+
+# Process links from Malc0de
+#
+# Modified for better performance and clarity
+def malc0de(soup):
+    print "[+] Fetching from MalC0de List"
+    mlc_sites = []
+    for i in soup.find_all('item'):
+        site_url = "http://" + re.sub('&amp;', '&', i.description.string.split()[1]).replace(',', '')
+        mlc_sites.append(site_url)
 
-		if not os.path.exists(fpath):
-			file = open(fpath, 'wb')
-			file.write(url_dl)
-			file.close
-			print "-- Saved file type %s with md5: %s" % (filetype,md5)
+    print "\t [+] Found %s urls" % len(mlc_sites)
+    for row in mlc_sites:
+        decisor(row)
 
-def malwaredl(soup):
-	print "- Fetching from Malware Domain List"
-	mdl=[]
-	for row in soup('description'):
-		mdl.append(row)
-	del mdl[0]
-	mdl_sites=[]
-	for row in mdl:
-		site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
-		if site == '-':
-			mdl_sites.append(re.sub('&amp;','&',str(row).split()[4]).replace(',',''))
-		else:
-			mdl_sites.append(site)
-	print "-- Found %s urls" % len(mdl)
-	for row in mdl_sites:
-		decisor(row)
+# Process links from Malware Black List
+#
+# Modified for better performance and clarity
+def malwarebl(soup):
+    print "[+] Fetching from Malware Black List"
+    mbl_sites = []
 
-def vxvault(soup):
-	print "- Fetching from VXVault"
-	vxv=[]
-	for row in soup('pre'):
-		vxv = row.string.split('\r\n')
-	del vxv[:4]
-	del vxv[-1]
-	print "-- Found %s urls" % len(vxv)
-	for row in vxv:
-		decisor(row)
+    for i in soup.find_all('item'):
+        site_url = "http://" + re.sub('&amp;', '&', i.description.string.split()[1]).replace(',', '')
+        mbl_sites.append(site_url)
 
-def malc0de(soup):
-	print "- Fetching from Malc0de"
-	mlc=[]
-	for row in soup('description'):
-		mlc.append(row)
-	del mlc[0]
-	mlc_sites=[]
-	for row in mlc:
-		site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
-		mlc_sites.append(site)
-	print "-- Found %s urls" % len(mlc_sites)
-	for row in mlc_sites:
-		decisor(row)
+    print "\t [+] Found %s urls" % len(mbl_sites)
+    for row in mbl_sites:
+        decisor(row)
 
-def malwarebl(soup):
-	print "- Fetching from Malware Black List"
-	mbl=[]
-	for row in soup('description'):
-		site = str(row).split()[1].replace(',','')
-		mbl.append(site)
-	print "-- Found %s urls" % len(mbl)
-	for row in mbl:
-		decisor(row)
 
 def minotaur(soup):
-	print "- Fetching from NovCon Minotaur"
-	min=[]
-	for row in soup('td'):
-		try:
-			if re.match('http',row.string):
-				min.append(row.string)
-		except:
-			pass
-	print "-- Found %s urls" % len(min)
-	for row in min:
-		decisor(row)
+    print "- Fetching from NovCon Minotaur"
+    min = []
+    for row in soup('td'):
+        try:
+            if re.match('http', row.string):
+                min.append(row.string)
+        except:
+            pass
+    print "-- Found %s urls" % len(min)
+    for row in min:
+        decisor(row)
+
 def sacour(soup):
-	print "- Fetching from Sacour.cn"
-	for url in soup('a'):
-		min=[]
-		if re.match('list/',url['href']):
-			suburl = parse('http://www.sacour.cn/'+url['href'])
-			for text in suburl('body'):
-				for urls in text.contents:
-					if re.match('http://',str(urls)):
-						min.append(str(urls))
-			if len(min) > 0:
-				print "-- Found %s urls in %s" % (len(min),url['href'])
-				for row in min:
-					decisor(row)
+    print "- Fetching from Sacour.cn"
+    for url in soup('a'):
+        min = []
+        if re.match('list/', url['href']):
+            suburl = parse('http://www.sacour.cn/' + url['href'])
+            for text in suburl('body'):
+                for urls in text.contents:
+                    if re.match('http://', str(urls)):
+                        min.append(str(urls))
+            if len(min) > 0:
+                print "-- Found %s urls in %s" % (len(min), url['href'])
+                for row in min:
+                    decisor(row)
+
 if __name__ == "__main__":
-	print "Malware Parser v0.4"
-
-	try:
-		if sys.argv[1] == '-t':
-			loadthug()
-	except:
-		print "- Thug analysis not enabled (use -t to enable thug)"
-
-	#source list
-	minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx'))
-	malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml'))
-	vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
-	malc0de(parse('http://malc0de.com/rss'))
-	malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
-	sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
\ No newline at end of file
+    print "Malware Parser v0.4"
+
+    try:
+        if sys.argv[1] == '-t':
+            loadthug()
+    except:
+        print "- Thug analysis not enabled (use -t to enable thug)"
+
+    # source list
+    try:
+        minotaur(parse('http://minotauranalysis.com/malwarelist-urls.aspx'))
+    except:
+        print "[ERROR] Unexpected error processing Minotaur List"
+
+    try:
+        malwaredl(parse('http://www.malwaredomainlist.com/hostslist/mdl.xml'))
+    except:
+        print "[ERROR] Unexpected error processing Malware Domain List"
+
+    try:
+        vxvault(parse('http://vxvault.siri-urz.net/URL_List.php'))
+    except:
+        print "[ERROR] Unexpected error processing VXVault List"
+
+    try:
+        malc0de(parse('http://malc0de.com/rss'))
+    except:
+        print "[ERROR] Unexpected error processing Malc0de List"
+
+    try:
+        malwarebl(parse('http://www.malwareblacklist.com/mbl.xml'))
+    except:
+        print "[ERROR] Unexpected error processing Malware Black List"
+
+    try:
+        sacour(parse('http://www.sacour.cn/showmal.asp?month=%d&year=%d' % (now.month, now.year)))
+    except:
+        print "[ERROR] Unexpected error processing Sacour List"
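
Reviewer note, not part of the patch: the rewritten feed parsers all lean on the
same BeautifulSoup 4 idiom, soup.find_all('item') plus tag.description.string,
where the BS3 code iterated soup('description') and str()-ed each tag. A minimal,
self-contained sketch of that idiom follows; the inline RSS sample, the file name,
and the 'html.parser' choice are illustrative assumptions, not code from this patch.

# sketch.py (hypothetical) -- BS4 idiom used by malwaredl()/malc0de()/malwarebl()
from bs4 import BeautifulSoup as bs

# made-up feed, shaped so that, as in the real feeds, split()[1] is the URL
SAMPLE_RSS = """
<rss><channel>
<item><description>site: http://example.test/mal.exe, IP: 203.0.113.7</description></item>
<item><description>site: http://example.invalid/x.bin, IP: 198.51.100.2</description></item>
</channel></rss>
"""

soup = bs(SAMPLE_RSS, 'html.parser')  # BS4 takes an explicit parser name
for i in soup.find_all('item'):       # BS3 spelled this soup('item')
    # same extraction the patch applies: second field, trailing comma stripped
    print i.description.string.split()[1].replace(',', '')

Run under Python 2 to match the patch, this prints the two sample URLs. The
point of the idiom is that .description.string yields the tag text directly,
which is why the rewritten loops no longer need the del/str() bookkeeping of
the old BS3 versions.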