diff --git a/README.md b/README.md
index fdc0b40..b4476b0 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,28 @@
-mwcrawler
-=========
+mwcrawler 0.9
+=============
 
 mwcrawler is a simple python script that parses malicious url lists from well
 known websites (i.e. MDL, Malc0de) in order to automatically download the
 malicious code. It can be used to populate malware repositories or zoos.
 
+The latest release of mwcrawler is maintained and updated by Francisco Donoso.
+The original author is Ricardo Dias:
+https://github.com/0day1day/mwcrawler
+
 Currently the script parses the following sources:
 
-- NovCon Minotaur:
-  http://minotauranalysis.com/malwarelist-urls.aspx
 - Malware Domain List:
   http://www.malwaredomainlist.com/hostslist/mdl.xml
 - VX Vault:
   http://vxvault.siri-urz.net/URL_List.php
 - Malc0de:
   http://malc0de.com/rss
-- Malware Black List:
-  http://www.malwareblacklist.com/mbl.xml
-- Sacour.cn:
-  http://www.sacour.cn
+- ThreatGlass:
+  http://threatglass.com
+- CleanMX:
+  http://support.clean-mx.de/clean-mx/viruses
+- Zeus Tracker:
+  https://zeustracker.abuse.ch
 
-The downloaded content is stored in /opt/malware/unsorted/ by default, so you
-need to create this folder first, or change the source code otherwise.
+The downloaded content is stored in /opt/malware/unsorted/ by default.
 
 Sub-folders will be created, based on the magic numbers of the downloaded
 content (i.e. PE32, PDF, ZIP). For the sake of simplicity note that the script
 splits the file description string and only use the first 'token'.
@@ -36,17 +38,30 @@ html code for low interaction analysis.
 
 Requirements:
 
-- BeautifulSoup 3.0.8 (later versions seem to have problems parsing html):
+- BeautifulSoup:
   http://www.crummy.com/software/BeautifulSoup/
+- Python Magic:
+  https://github.com/ahupp/python-magic
 
 Usage:
 
-$ python mwcrawler.py
+```$ python mwcrawler.py```
+
+Use '-t' for thug analysis:
+
+```$ python mwcrawler.py -t```
+
+Use '-d' to enable debug logging:
+
+```$ python mwcrawler.py -t -d```
+
+Use '-o' to attempt to download samples marked as "offline" by Zeus Tracker:
+
+```$ python mwcrawler.py -t -d -o```
 
-Use '-t' for thug analysis
-$ python mwcrawler.py -t
+
 
 References:
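The storage layout described above (first token of the libmagic description as the sub-folder, MD5 as the file name) is what `gettype()`, `decisor()` and `downloader()` implement in the patch below. A minimal sketch of that convention, assuming python-magic is installed; the `archive()` helper name and its `base` argument are illustrative and not part of the script:

```python
import hashlib
import os

import magic  # python-magic, https://github.com/ahupp/python-magic


def archive(buf, base='/opt/malware/unsorted'):
    # first token of the magic description picks the sub-folder, e.g. 'PE32', 'PDF', 'Zip'
    filetype = magic.from_buffer(buf).split(' ')[0]
    md5 = hashlib.md5(buf).hexdigest()
    dest = os.path.join(base, filetype)
    if not os.path.exists(dest):
        os.makedirs(dest)
    fpath = os.path.join(dest, md5)    # e.g. /opt/malware/unsorted/PE32/<md5>
    if not os.path.exists(fpath):      # samples are de-duplicated by hash
        with open(fpath, 'wb') as f:
            f.write(buf)
    return fpath
```

A Windows executable therefore ends up under /opt/malware/unsorted/PE32/ with its MD5 hash as the file name, matching the sub-folder behaviour the README describes.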
diff --git a/mwcrawler.py b/mwcrawler.py
index 19d0d56..0f41c40 100644
--- a/mwcrawler.py
+++ b/mwcrawler.py
@@ -1,7 +1,9 @@
 #!/usr/bin/python
 # Copyright (C) 2012 Ricardo Dias
 #
-# Malware Crawler Module v0.4
+# Updated 07/05/2015
+# Updated by Francisco Donoso
+# Malware Crawler Module v0.9
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -17,7 +19,10 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 # Requirements:
-# - BeautifulSoup 3.0.8
+# - BeautifulSoup 4.0
+# - python-magic
+
 
 from BeautifulSoup import BeautifulSoup as bs
 import sys
@@ -28,6 +33,9 @@
 import os
 import socket
 import datetime
+import logging
+import re
+import argparse
 
 # By default thug analyis is disabled
 isthug = False
@@ -43,159 +51,324 @@
 # load thug function, also checks if thug is installed
 def loadthug():
     try:
-        sys.path.append('/opt/thug/src')
+        sys.path.append('/opt/thug/src') # add Thug's source directory to the module search path
+        global thug
         import thug
+        global isthug
         isthug = True
-        print "- Thug module loaded for html analysis"
+        logging.info("Thug module loaded for html analysis")
     except ImportError:
-        print "- No Thug module found, html code inspection won't be available"
+        logging.error("No Thug module found, html code inspection won't be available. Please verify the Thug path")
 
-# determine file type for correct archival
+# Use python-magic to determine the file type - now uses the same module as Thug
 def gettype(file):
-    ms = magic.open(magic.MAGIC_NONE)
-    ms.load()
-    return ms.buffer(file)
+    file_type = magic.from_buffer(file)
+    logging.debug("File: %s is filetype %s",file,file_type)
+    return file_type
+
+# Automatically send a URL to Thug
+def thugOnly(url):
+    if not re.match('http',url):
+        url = 'http://'+url
+
+    if isthug:
+        logging.info("Thug candidate: HTML code in %s",url)
+
+        try:
+            thug.Thug([url])()
+        except Exception, e:
+            logging.error("Thug error: %s",e)
+        return
 
 # beautifulsoup parser
 def parse(url):
+    logging.debug("Trying to parse the following source: %s",url)
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
    try:
-        http = bs(urllib2.urlopen(request))
+        http = bs(urllib2.urlopen(request)) # grab a soup object and name it http
    except:
-        print "- Error parsing %s" % (url)
+        logging.error("Error parsing %s",url)
        return
-    return http
+    return http # return the soup object
 
+# decide what to do with the malicious urls
 def decisor(url):
-    if not re.match('http',url):
-        url = 'http://'+url
+    logging.debug("trying to decide what to do with: %s",url)
+    if not re.match('http',url):
+        url = 'http://'+url
 
     try:
         url_dl = urllib2.urlopen(url).read()
+        logging.debug("getting the contents of the malware URL: %s",url)
     except Exception, e:
-        #print "-- Error: %s" % e
+        logging.error("Error: %s on %s",e,url)
         return
 
     filetype = gettype(url_dl).split(' ')[0]
-    md5 = hashlib.md5(url_dl).hexdigest()
+    logging.debug("the filetype is: %s",filetype)
+    md5 = hashlib.md5(url_dl).hexdigest()
 
-    if (filetype == 'HTML'):
+    if (filetype == 'HTML'):
         if isthug:
-            print "-- Thug candidate: HTML code in %s" % url
+            logging.info("Thug candidate: HTML code in %s",url)
 
             try:
                 thug.Thug([url])()
             except Exception, e:
-                print "- Thug error: %s" % e
+                logging.error("Thug error: %s",e)
             return
 
     else:
-        dest = '/opt/malware/unsorted/'+filetype
-        fpath = dest+'/'+str(md5)
+        dest = '/opt/malware/unsorted/'+filetype
+        logging.debug("Going to put this in %s",dest)
+        fpath = dest+'/'+str(md5)
+        logging.debug("the filepath will be: %s",fpath)
 
-        if not os.path.exists(dest):
+        if not os.path.exists(dest):
             os.makedirs(dest)
 
         if not os.path.exists(fpath):
-            file = open(fpath, 'wb')
+            file = open(fpath, 'wb')
             file.write(url_dl)
             file.close
-            print "-- Saved file type %s with md5: %s" % (filetype,md5)
+            logging.info("Saved file type %s with md5: %s",filetype,md5)
+
+# the file is known not to be HTML; just download it
+def downloader(url):
+    if not re.match('http',url):
+        url = 'http://'+url
+
+    try:
+        url_dl = urllib2.urlopen(url).read()
+        logging.debug("getting the contents of the malware URL: %s",url)
+    except Exception, e:
+        logging.debug("Unable to download: %s due to: %s",url,e)
+        return
+
+    filetype = gettype(url_dl).split(' ')[0]
+    logging.debug("the filetype is: %s",filetype)
+    md5 = hashlib.md5(url_dl).hexdigest()
+
+    dest = '/opt/malware/unsorted/'+filetype # one folder per filetype
+    logging.debug("Going to put this in %s",dest)
+    fpath = dest+'/'+str(md5)
+    logging.debug("the filepath will be: %s",fpath)
+
+    if not os.path.exists(dest):
+        os.makedirs(dest)
+
+    if not os.path.exists(fpath):
+        file = open(fpath, 'wb')
+        file.write(url_dl)
+        file.close()
+        logging.info("Saved file type %s with md5: %s",filetype,md5)
+
+# remove the date from ThreatGlass URLs
+def removedate(threat):
+    p = re.compile(ur'.\d{4}.((0\d)|(1[012])).(([012]\d)|3[01])')
+    return re.sub(p,'',threat)
+
+# Parse data from ThreatGlass
+def parseThreatGlass(soup):
+    logging.info("Fetching from ThreatGlass")
+    tglist = []
+    for tag in soup.findAll('a', href=True):
+        tglist.append(tag['href'])
+    del tglist[:5] # remove all of their header links
+    del tglist[-1] # remove the link to the next page
+    tglist = set(tglist) # only attempt to download unique urls
+    logging.info("Found %s urls on this page on ThreatGlass",len(tglist))
+    for each in tglist:
+        thugOnly(removedate(str(each.split('/malicious_urls/')[1]).replace('-','.')))
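+
+# Illustrative example (assumed slug format, not taken from the feed itself):
+# a tile link such as '/malicious_urls/www-example-com-2015-07-05' becomes
+# 'www.example.com.2015.07.05' after the '-' to '.' replacement above, and
+# removedate() then strips the trailing '.2015.07.05' before the hostname is
+# handed to thugOnly().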
+
+# threatglass page crawler
+def threatGlass(pages):
+    logging.info("Getting %s pages from ThreatGlass",pages)
+    for tile in range(1,pages+1):
+        parseThreatGlass(parse('http://threatglass.com/tiles?page=' + str(tile)))
 
 def malwaredl(soup):
-    print "- Fetching from Malware Domain List"
+    logging.info("Fetching from Malware Domain List")
     mdl=[]
-    for row in soup('description'):
-        mdl.append(row)
-    del mdl[0]
-    mdl_sites=[]
+    for row in soup('description'):
+        mdl.append(row)
+    del mdl[0]
+    mdl_sites=[]
     for row in mdl:
-        site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
+        site = re.sub('&amp;','&',str(row).split()[1]).replace(',','') # unescape ampersands and strip the trailing comma
         if site == '-':
-            mdl_sites.append(re.sub('&amp;','&',str(row).split()[4]).replace(',',''))
+            mdl_sites.append(re.sub('&amp;','&',str(row).split()[4]).replace(',','')) # hostname field is '-', use the alternate field from the description instead
         else:
-            mdl_sites.append(site)
-    print "-- Found %s urls" % len(mdl)
+            mdl_sites.append(site)
+    mdl_sites = set(mdl_sites) # only attempt to download unique urls
+    logging.info("Found %s urls on Malware Domain List",len(mdl_sites))
     for row in mdl_sites:
         decisor(row)
 
+# VX Vault parser
 def vxvault(soup):
-    print "- Fetching from VXVault"
-    vxv=[]
-    for row in soup('pre'):
+    logging.info("Fetching from VXVault")
+    vxv=[]
+    for row in soup('pre'):
         vxv = row.string.split('\r\n')
-    del vxv[:4]
-    del vxv[-1]
-    print "-- Found %s urls" % len(vxv)
+    del vxv[:4] # remove the header rows that are not malware urls
+    del vxv[-1] # delete the useless last row
+    vxv = set(vxv) # only attempt to download unique urls
+    logging.info("Found %s urls from VXVault",len(vxv))
     for row in vxv:
-        decisor(row)
+        decisor(row) # decide and download
 
+# Malc0de parser
 def malc0de(soup):
-    print "- Fetching from Malc0de"
-    mlc=[]
-    for row in soup('description'):
-        mlc.append(row)
+    logging.info("Fetching from Malc0de")
+    mlc=[]
+    for row in soup('description'):
+        mlc.append(row)
     del mlc[0]
-    mlc_sites=[]
+    mlc_sites=[]
     for row in mlc:
-        site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
+        site = re.sub('&amp;','&',str(row).split()[1]).replace(',','')
         mlc_sites.append(site)
-    print "-- Found %s urls" % len(mlc_sites)
+    mlc_sites = set(mlc_sites) # only attempt to download unique urls
+    logging.info("Found %s urls from Malc0de",len(mlc_sites))
     for row in mlc_sites:
         decisor(row)
 
+# Malware Black List parser
 def malwarebl(soup):
-    print "- Fetching from Malware Black List"
+    logging.info("Fetching from Malware Black List")
     mbl=[]
     for row in soup('description'):
         site = str(row).split()[1].replace(',','')
         mbl.append(site)
-    print "-- Found %s urls" % len(mbl)
+    mbl = set(mbl)
+    logging.info("Found %s urls from Malware Black List",len(mbl))
     for row in mbl:
         decisor(row)
 
-def minotaur(soup):
-    print "- Fetching from NovCon Minotaur"
-    min=[]
-    for row in soup('td'):
-        try:
-            if re.match('http',row.string):
-                min.append(row.string)
-        except:
-            pass
-    print "-- Found %s urls" % len(min)
-    for row in min:
-        decisor(row)
+# CleanMX parser
+def cleanmx(soup):
+    logging.info("Fetching from clean-mx.de")
+    cmxlist = []
+    for each in soup.body.findAll('a', href=True, title="open Url in new Browser at your own risk !"):
+        site = re.sub('&amp;','&',str(each['href']))
+        site = urllib2.unquote(site).decode('utf8')
+        logging.debug("cleanmx parser was able to parse out: %s",site)
+        cmxlist.append(site)
+    cmxlist = set(cmxlist)
+    logging.info("Found %s urls from clean-mx.de",len(cmxlist))
+    for site in cmxlist:
+        decisor(site)
+
+# Zeus Tracker binaries
+def zeustrackerbin(soup):
+    logging.info("Fetching from Zeus Tracker Binaries RSS feed")
+    ztlist = []
+    offline_list = []
+    for each in soup('description'):
+        ztlist.append(each)
+    del ztlist[0]
+    for entry in ztlist:
+        url = re.search('(?:URL: )([^,]+)',str(entry)).group(1)
+        status = re.search('(?:status: )([^,]+)',str(entry)).group(1)
+        if args.offline:
+            logging.debug("attempting to download %s regardless of status",url)
+            downloader(url)
+        elif status == "offline":
+            logging.debug("%s is marked as offline and will not be downloaded",url)
+            offline_list.append(url)
+        else:
+            downloader(url)
+    if len(offline_list) == len(ztlist):
+        logging.warning("All parsed items listed as offline. Use -o to attempt to download anyway")
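+
+# Illustrative example (assumed field layout, not taken from the feed itself):
+# a description such as 'URL: http://example.com/bot.exe, status: online, ...'
+# yields url='http://example.com/bot.exe' and status='online' from the two
+# regular expressions above; with -o the status field is ignored entirely.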
 
-def sacour(soup):
-    print "- Fetching from Sacour.cn"
-    for url in soup('a'):
-        min=[]
-        if re.match('list/',url['href']):
-            suburl = parse('http://www.sacour.cn/'+url['href'])
-            for text in suburl('body'):
-                for urls in text.contents:
-                    if re.match('http://',str(urls)):
-                        min.append(str(urls))
-            if len(min) > 0:
-                print "-- Found %s urls in %s" % (len(min),url['href'])
-                for row in min:
-                    decisor(row)
+# Zeus Tracker dropzones (HTML)
+def zeustrackerhtml(soup):
+    logging.info("Fetching from Zeus Tracker DropZones RSS feed")
+    if not args.thug:
+        logging.warning("Thug analysis not enabled. Zeus Tracker dropzones will not be processed")
+    ztlist = []
+    for each in soup('description'):
+        ztlist.append(each)
+    del ztlist[0]
+    offline_list = []
+    for entry in ztlist:
+        url = re.search('(?:URL: )([^,]+)',str(entry)).group(1)
+        status = re.search('(?:status: )([^