From f770409cb8f21bf2daeb8b580a92f56417a6c4ba Mon Sep 17 00:00:00 2001
From: Angelo Dell'Aera <buffer@olografix.org>
Date: Wed, 21 Jun 2017 16:49:44 +0200
Subject: [PATCH 1/2] Multiple improvements

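	- Code layout changes (PEP-8 coding style)
	- Python 3 compatibility (ConfigParser and StringIO are now imported
	  through six)
	- Parser class initialization refactoring (__init__ is split into
	  dedicated __init_* helper methods)
	- Compile the regexps with the IGNORECASE flag
	- Remove iocp/Whitelist.py
	- Add six and lxml to requirements.txt; setup.py now reads
	  install_requires from requirements.txt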
---
 bin/iocp          |  20 +-
 iocp/Output.py    | 184 +++++++++--------
 iocp/Parser.py    | 506 +++++++++++++++++++++++-----------------------
 iocp/Whitelist.py |  12 --
 iocp/__init__.py  |   2 +-
 requirements.txt  |   2 +
 setup.py          |  40 ++--
 7 files changed, 382 insertions(+), 384 deletions(-)
 delete mode 100644 iocp/Whitelist.py

diff --git a/bin/iocp b/bin/iocp
index 8047f6f..f53b5ac 100755
--- a/bin/iocp
+++ b/bin/iocp
@@ -40,14 +40,14 @@ import argparse
 from iocp import Parser
 
 if __name__ == "__main__":
-	argparser = argparse.ArgumentParser()
-	argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)')
-	argparser.add_argument('-p', dest='INI', default=None, help='Pattern file')
-	argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)')
-	argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/tsv/json/yara/netflow)')
-	argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches')
-	argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)')
-	args = argparser.parse_args()
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)')
+    argparser.add_argument('-p', dest='INI', default=None, help='Pattern file')
+    argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)')
+    argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/tsv/json/yara/netflow)')
+    argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches')
+    argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)')
+    args = argparser.parse_args()
 
-	parser = Parser.Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT)
-	parser.parse(args.PATH)
\ No newline at end of file
+    parser = Parser.Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT)
+    parser.parse(args.PATH)
diff --git a/iocp/Output.py b/iocp/Output.py
index 93b109b..dbebe80 100644
--- a/iocp/Output.py
+++ b/iocp/Output.py
@@ -7,112 +7,120 @@
 
 OUTPUT_FORMATS = ('csv', 'tsv', 'json', 'yara', 'netflow', )
 
+
 def getHandler(output_format):
-	output_format = output_format.lower()
-	if output_format not in OUTPUT_FORMATS:
-		print("[WARNING] Invalid output format specified... using CSV")
-		output_format = 'csv'
+    output_format = output_format.lower()
+    if output_format not in OUTPUT_FORMATS:
+        print("[WARNING] Invalid output format specified... using CSV")
+        output_format = 'csv'
+
+    handler_format = "OutputHandler_" + output_format
+    handler_class = getattr(sys.modules[__name__], handler_format)
 
-	handler_format = "OutputHandler_" + output_format
-	handler_class = getattr(sys.modules[__name__], handler_format)
+    return handler_class()
 
-	return handler_class()
 
 class OutputHandler(object):
-	def print_match(self, fpath, page, name, match):
-		pass
+    def print_match(self, fpath, page, name, match):
+        pass
+
+    def print_header(self, fpath):
+        pass
 
-	def print_header(self, fpath):
-		pass
+    def print_footer(self, fpath):
+        pass
 
-	def print_footer(self, fpath):
-		pass
+    def print_error(self, fpath, exception):
+        print("[ERROR] %s" % (exception))
 
-	def print_error(self, fpath, exception):
-		print("[ERROR] %s" % (exception))
 
 class OutputHandler_csv(OutputHandler):
-	def __init__(self):
-		self.csv_writer = csv.writer(sys.stdout)
+    def __init__(self):
+        self.csv_writer = csv.writer(sys.stdout)
 
-	def print_match(self, fpath, page, name, match):
-		self.csv_writer.writerow((fpath, page, name, match))
+    def print_match(self, fpath, page, name, match):
+        self.csv_writer.writerow((fpath, page, name, match))
+
+    def print_error(self, fpath, exception):
+        self.csv_writer.writerow((fpath, '0', 'error', exception))
 
-	def print_error(self, fpath, exception):
-		self.csv_writer.writerow((fpath, '0', 'error', exception))
 
 class OutputHandler_tsv(OutputHandler):
-	def __init__(self):
-		self.csv_writer = csv.writer(sys.stdout, delimiter = '\t')
+    def __init__(self):
+        self.csv_writer = csv.writer(sys.stdout, delimiter = '\t')
+
+    def print_match(self, fpath, page, name, match):
+        self.csv_writer.writerow((fpath, page, name, match))
 
-	def print_match(self, fpath, page, name, match):
-		self.csv_writer.writerow((fpath, page, name, match))
+    def print_error(self, fpath, exception):
+        self.csv_writer.writerow((fpath, '0', 'error', exception))
 
-	def print_error(self, fpath, exception):
-		self.csv_writer.writerow((fpath, '0', 'error', exception))
 
 class OutputHandler_json(OutputHandler):
-	def print_match(self, fpath, page, name, match):
-		data = {
-			'path' : fpath,
-			'file' : os.path.basename(fpath),
-			'page' : page,
-			'type' : name,
-			'match': match
-		}
-
-		print(json.dumps(data))
-
-	def print_error(self, fpath, exception):
-		data = {
-			'path'      : fpath,
-			'file'      : os.path.basename(fpath),
-			'type'      : 'error',
-			'exception' : exception
-		}
-
-		print(json.dumps(data))
+    def print_match(self, fpath, page, name, match):
+        data = {
+            'path' : fpath,
+            'file' : os.path.basename(fpath),
+            'page' : page,
+            'type' : name,
+            'match': match
+        }
+
+        print(json.dumps(data))
+
+    def print_error(self, fpath, exception):
+        data = {
+            'path'      : fpath,
+            'file'      : os.path.basename(fpath),
+            'type'      : 'error',
+            'exception' : exception
+        }
+
+        print(json.dumps(data))
+
 
 class OutputHandler_yara(OutputHandler):
-	def __init__(self):
-		self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256))
-
-	def print_match(self, fpath, page, name, match):
-		if name in self.cnt:
-			self.cnt[name] += 1
-		else:
-			self.cnt[name] = 1
-		
-		string_id = "$%s%d" % (name, self.cnt[name])
-		self.sids.append(string_id)
-		string_value = match.replace('\\', '\\\\')
-		print("\t\t%s = \"%s\"" % (string_id, string_value))
-
-	def print_header(self, fpath):
-		rule_name = os.path.splitext(os.path.basename(fpath))[0].translate(self.rule_enc)
-
-		print("rule %s" % (rule_name))
-		print("{")
-		print("\tstrings:")
-
-		self.cnt = {}
-		self.sids = []
-
-	def print_footer(self, fpath):
-		cond = ' or '.join(self.sids)
-
-		print("\tcondition:")
-		print("\t\t" + cond)
-		print("}")
-		
+    def __init__(self):
+        self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256))
+
+    def print_match(self, fpath, page, name, match):
+        if name in self.cnt:
+            self.cnt[name] += 1
+        else:
+            self.cnt[name] = 1
+
+        string_id = "$%s%d" % (name, self.cnt[name])
+        self.sids.append(string_id)
+        string_value = match.replace('\\', '\\\\')
+        print("\t\t%s = \"%s\"" % (string_id, string_value))
+
+    def print_header(self, fpath):
+        rule_name = os.path.splitext(os.path.basename(fpath))[0].translate(self.rule_enc)
+
+        print("rule %s" % (rule_name))
+        print("{")
+        print("\tstrings:")
+
+        self.cnt = {}
+        self.sids = []
+
+    def print_footer(self, fpath):
+        cond = ' or '.join(self.sids)
+
+        print("\tcondition:")
+        print("\t\t" + cond)
+        print("}")
+
+
 class OutputHandler_netflow(OutputHandler):
-	def __init__(self):
-		print "host 255.255.255.255"
-
-	def print_match(self, fpath, page, name, match):
-		data = {
-			'type' : name,
-			'match': match
-		}
-		if data["type"] == "IP":
-			print " or host %s " % data["match"]
+    def __init__(self):
+        print "host 255.255.255.255"
+
+    def print_match(self, fpath, page, name, match):
+        data = {
+            'type' : name,
+            'match': match
+        }
+
+        if data["type"] == "IP":
+            print " or host %s " % data["match"]
diff --git a/iocp/Parser.py b/iocp/Parser.py
index f8d084a..d5475fc 100644
--- a/iocp/Parser.py
+++ b/iocp/Parser.py
@@ -35,274 +35,280 @@
 #
 ###################################################################################################
 
-import os
 import sys
+import os
 import fnmatch
 import glob
 import re
-try:
-	import configparser as ConfigParser
-except ImportError:
-	import ConfigParser
-try:
-    from StringIO import StringIO
-except ImportError:
-    from io import StringIO
+import six.moves.configparser as ConfigParser
+from six import StringIO
 
 # Import optional third-party libraries
 IMPORTS = []
+
 try:
-	from PyPDF2 import PdfFileReader
-	IMPORTS.append('pypdf2')
+    from PyPDF2 import PdfFileReader
+    IMPORTS.append('pypdf2')
 except ImportError:
-	pass
+    pass
+
 try:
-	from pdfminer.pdfpage import PDFPage
-	from pdfminer.pdfinterp import PDFResourceManager
-	from pdfminer.converter import TextConverter
-	from pdfminer.pdfinterp import PDFPageInterpreter
-	from pdfminer.layout import LAParams
-	IMPORTS.append('pdfminer')
+    from pdfminer.pdfpage import PDFPage
+    from pdfminer.pdfinterp import PDFResourceManager
+    from pdfminer.converter import TextConverter
+    from pdfminer.pdfinterp import PDFPageInterpreter
+    from pdfminer.layout import LAParams
+    IMPORTS.append('pdfminer')
 except ImportError:
-	pass
+    pass
+
 try:
-	from bs4 import BeautifulSoup
-	IMPORTS.append('beautifulsoup')
+    from bs4 import BeautifulSoup
+    IMPORTS.append('beautifulsoup')
 except ImportError:
-	pass
+    pass
+
 try:
-	import requests
-	IMPORTS.append('requests')
+    import requests
+    IMPORTS.append('requests')
 except ImportError:
-	pass
+    pass
 
 # Import project source files
 import iocp
 from iocp import Output
 
+
 class Parser(object):
-	patterns = {}
-	defang = {}
-
-	def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None):
-		basedir = iocp.get_basedir()
-
-		if patterns_ini is None:
-			patterns_ini = os.path.join(basedir, 'data/patterns.ini')
-		self.load_patterns(patterns_ini)
-
-		wldir = os.path.join(basedir, 'data/whitelists')
-		self.whitelist = self.load_whitelists(wldir)
-
-		self.dedup = dedup
-		if output_handler:
-			self.handler = output_handler
-		else:
-			self.handler = Output.getHandler(output_format)
-
-		self.ext_filter = "*." + input_format
-		parser_format = "parse_" + input_format
-		try:
-			self.parser_func = getattr(self, parser_format)
-		except AttributeError:
-			e = 'Selected parser format is not supported: %s' % (input_format)
-			raise NotImplementedError(e)
-
-		self.library = library
-		if input_format == 'pdf':
-			if library not in IMPORTS:
-				e = 'Selected PDF parser library not found: %s' % (library)
-				raise ImportError(e)
-		elif input_format == 'html':
-			if 'beautifulsoup' not in IMPORTS:
-				e = 'HTML parser library not found: BeautifulSoup'
-				raise ImportError(e)
-
-	def load_patterns(self, fpath):
-		config = ConfigParser.ConfigParser()
-		with open(fpath) as f:
-			config.readfp(f)
-
-		for ind_type in config.sections():
-			try:
-				ind_pattern = config.get(ind_type, 'pattern')
-			except:
-				continue
-
-			if ind_pattern:
-				ind_regex = re.compile(ind_pattern)
-				self.patterns[ind_type] = ind_regex
-
-			try:
-				ind_defang = config.get(ind_type, 'defang')
-			except:
-				continue
-
-			if ind_defang:
-				self.defang[ind_type] = True
-
-	def load_whitelists(self, fpath):
-		whitelist = {}
-
-		searchdir = os.path.join(fpath, "whitelist_*.ini")
-		fpaths = glob.glob(searchdir)
-		for fpath in fpaths:
-			t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1]
-			patterns = [line.strip() for line in open(fpath)]
-			whitelist[t]  = [re.compile(p) for p in patterns]
-
-		return whitelist
-
-	def is_whitelisted(self, ind_match, ind_type):
-		try:
-			for w in self.whitelist[ind_type]:
-				if w.findall(ind_match):
-					return True
-		except KeyError as e:
-			pass
-		return False
-
-	def parse_page(self, fpath, data, page_num):
-		for ind_type, ind_regex in self.patterns.items():
-			matches = ind_regex.findall(data)
-
-			for ind_match in matches:
-				if isinstance(ind_match, tuple):
-					ind_match = ind_match[0]
-
-				if self.is_whitelisted(ind_match, ind_type):
-					continue
-
-				if ind_type in self.defang:
-					ind_match = re.sub(r'\[\.\]', '.', ind_match)
-
-				if self.dedup:
-					if (ind_type, ind_match) in self.dedup_store:
-						continue
-
-					self.dedup_store.add((ind_type, ind_match))
-
-				self.handler.print_match(fpath, page_num, ind_type, ind_match)
-
-	def parse_pdf_pypdf2(self, f, fpath):
-		try:
-			pdf = PdfFileReader(f, strict = False)
-
-			if self.dedup:
-				self.dedup_store = set()
-
-			self.handler.print_header(fpath)
-			page_num = 0
-			for page in pdf.pages:
-				page_num += 1
-
-				data = page.extractText()
-
-				self.parse_page(fpath, data, page_num)
-			self.handler.print_footer(fpath)
-		except (KeyboardInterrupt, SystemExit):
-			raise
-
-	def parse_pdf_pdfminer(self, f, fpath):
-		try:
-			laparams = LAParams()
-			laparams.all_texts = True  
-			rsrcmgr = PDFResourceManager()
-			pagenos = set()
-
-			if self.dedup:
-				self.dedup_store = set()
-
-			self.handler.print_header(fpath)
-			page_num = 0
-			for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
-				page_num += 1
-
-				retstr = StringIO()
-				device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
-				interpreter = PDFPageInterpreter(rsrcmgr, device)
-				interpreter.process_page(page)
-				data = retstr.getvalue()
-				retstr.close()
-
-				self.parse_page(fpath, data, page_num)
-			self.handler.print_footer(fpath)
-		except (KeyboardInterrupt, SystemExit):
-			raise
-
-	def parse_pdf(self, f, fpath):
-		parser_format = "parse_pdf_" + self.library
-		try:
-			self.parser_func = getattr(self, parser_format)
-		except AttributeError:
-			e = 'Selected PDF parser library is not supported: %s' % (self.library)
-			raise NotImplementedError(e)
-			
-		self.parser_func(f, fpath)
-
-	def parse_txt(self, f, fpath):
-		try:
-			if self.dedup:
-				self.dedup_store = set()
-
-			data = f.read()
-			self.handler.print_header(fpath)
-			self.parse_page(fpath, data, 1)
-			self.handler.print_footer(fpath)
-		except (KeyboardInterrupt, SystemExit):
-			raise
-
-	def parse_html(self, f, fpath):
-		try:
-			if self.dedup:
-				self.dedup_store = set()
-				
-			data = f.read()
-			soup = BeautifulSoup(data)
-			html = soup.findAll(text=True)
-
-			text = u''
-			for elem in html:
-				if elem.parent.name in ['style', 'script', '[document]', 'head', 'title']:
-					continue
-				elif re.match('<!--.*-->', unicode(elem)):
-					continue
-				else:
-					text += unicode(elem)
-
-			self.handler.print_header(fpath)
-			self.parse_page(fpath, text, 1)
-			self.handler.print_footer(fpath)
-		except (KeyboardInterrupt, SystemExit):
-			raise
-
-	def parse(self, path):
-		try:
-			if path.startswith('http://') or path.startswith('https://'):
-				if 'requests' not in IMPORTS:
-					e = 'HTTP library not found: requests'
-					raise ImportError(e)
-				headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' }
-				r = requests.get(path, headers=headers)
-				r.raise_for_status()
-				f = StringIO(r.content)
-				self.parser_func(f, path)
-				return
-			elif os.path.isfile(path):
-				with open(path, 'rb') as f:
-					self.parser_func(f, path)
-				return
-			elif os.path.isdir(path):
-				for walk_root, walk_dirs, walk_files in os.walk(path):
-					for walk_file in fnmatch.filter(walk_files, self.ext_filter):
-						fpath = os.path.join(walk_root, walk_file)
-						with open(fpath, 'rb') as f:
-							self.parser_func(f, fpath)
-				return
-
-			e = 'File path is not a file, directory or URL: %s' % (path)
-			raise IOError(e)
-		except (KeyboardInterrupt, SystemExit):
-			raise
-		except Exception as e:
-			self.handler.print_error(path, e)
\ No newline at end of file
+    patterns = {}
+    defang   = {}
+
+    def __init__(self, patterns_ini = None, input_format = 'pdf', dedup = False, library = 'pdfminer', output_format = 'csv', output_handler = None):
+        self.__init_patterns(patterns_ini)
+        self.__init_whitelist()
+        self.__init_dedup(dedup)
+        self.__init_output_handler(output_format, output_handler)
+        self.__init_parser(input_format)
+        self.__init_library(library, input_format)
+
+    def __init_patterns(self, patterns_ini):
+        if patterns_ini is None:
+            patterns_ini = os.path.join(iocp.get_basedir(), 'data/patterns.ini')
+
+        self.load_patterns(patterns_ini)
+
+    def __init_whitelist(self):
+        wldir = os.path.join(iocp.get_basedir(), 'data/whitelists')
+        self.whitelist = self.load_whitelists(wldir)
+
+    def __init_dedup(self, dedup):
+        self.dedup = dedup
+
+        if dedup:
+            self.dedup_store = set()
+
+    def __init_output_handler(self, output_format, output_handler):
+        self.handler = output_handler if output_handler else Output.getHandler(output_format)
+
+    def __init_parser(self, input_format):
+        self.ext_filter = "*.{}".format(input_format)
+        parser_format = "parse_{}".format(input_format)
+
+        self.parser_func = getattr(self, parser_format, None)
+        if not self.parser_func:
+            print('Selected parser format is not supported: {}'.format(input_format))
+            sys.exit(-1)
+
+    def __init_library(self, library, input_format):
+        self.library = library
+
+        if input_format in ('pdf', ) and library not in IMPORTS:
+            print('PDF parser library not found: {}'.format(library))
+            sys.exit(-1)
+
+        if input_format in ('html', ) and 'beautifulsoup' not in IMPORTS:
+            print('HTML parser library not found: BeautifulSoup')
+            sys.exit(-1)
+
+    def load_patterns(self, fpath):
+        config = ConfigParser.ConfigParser()
+
+        with open(fpath) as f:
+            config.readfp(f)
+
+        for ind_type in config.sections():
+            try:
+                ind_pattern = config.get(ind_type, 'pattern')
+            except ConfigParser.NoOptionError:
+                continue
+
+            if ind_pattern:
+                ind_regex = re.compile(ind_pattern, flags = re.IGNORECASE)
+                self.patterns[ind_type] = ind_regex
+
+            try:
+                ind_defang = config.get(ind_type, 'defang')
+            except ConfigParser.NoOptionError:
+                continue
+
+            if ind_defang:
+                self.defang[ind_type] = True
+
+    def load_whitelists(self, fpath):
+        whitelist = {}
+
+        searchdir = os.path.join(fpath, "whitelist_*.ini")
+        fpaths = glob.glob(searchdir)
+        for fpath in fpaths:
+            t = os.path.splitext(os.path.split(fpath)[1])[0].split('_', 1)[1]
+            patterns = [line.strip() for line in open(fpath)]
+            whitelist[t] = [re.compile(p, flags = re.IGNORECASE) for p in patterns]
+
+        return whitelist
+
+    def is_whitelisted(self, ind_match, ind_type):
+        try:
+            for w in self.whitelist[ind_type]:
+                if w.findall(ind_match):
+                    return True
+        except KeyError:
+            pass
+
+        return False
+
+    def parse_page(self, fpath, data, page_num):
+        for ind_type, ind_regex in self.patterns.items():
+            matches = ind_regex.findall(data)
+
+            for ind_match in matches:
+                if isinstance(ind_match, tuple):
+                    ind_match = ind_match[0]
+
+                if self.is_whitelisted(ind_match, ind_type):
+                    continue
+
+                if ind_type in self.defang:
+                    ind_match = re.sub(r'\[\.\]', '.', ind_match)
+
+                if self.dedup:
+                    if (ind_type, ind_match) in self.dedup_store:
+                        continue
+
+                    self.dedup_store.add((ind_type, ind_match))
+
+                self.handler.print_match(fpath, page_num, ind_type, ind_match)
+
+    def parse_pdf_pypdf2(self, f, fpath):
+        try:
+            pdf = PdfFileReader(f, strict = False)
+
+            self.handler.print_header(fpath)
+            page_num = 0
+            for page in pdf.pages:
+                page_num += 1
+
+                data = page.extractText()
+
+                self.parse_page(fpath, data, page_num)
+
+            self.handler.print_footer(fpath)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+    def parse_pdf_pdfminer(self, f, fpath):
+        try:
+            laparams = LAParams()
+            laparams.all_texts = True
+            rsrcmgr = PDFResourceManager()
+            pagenos = set()
+
+            self.handler.print_header(fpath)
+            page_num = 0
+
+            for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
+                page_num += 1
+
+                retstr = StringIO()
+                device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
+                interpreter = PDFPageInterpreter(rsrcmgr, device)
+                interpreter.process_page(page)
+                data = retstr.getvalue()
+                retstr.close()
+
+                self.parse_page(fpath, data, page_num)
+
+            self.handler.print_footer(fpath)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+    def parse_pdf(self, f, fpath):
+        parser_format = "parse_pdf_" + self.library
+
+        self.parser_func = getattr(self, parser_format, None)
+        if not self.parser_func:
+            e = 'Selected PDF parser library is not supported: {}'.format(self.library)
+            raise NotImplementedError(e)
+
+        self.parser_func(f, fpath)
+
+    def parse_txt(self, f, fpath):
+        try:
+            data = f.read()
+            self.handler.print_header(fpath)
+            self.parse_page(fpath, data, 1)
+            self.handler.print_footer(fpath)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+    def parse_html(self, f, fpath):
+        try:
+            data = f.read()
+            soup = BeautifulSoup(data, "lxml")
+            html = soup.findAll(text = True)
+
+            text = u''
+            for elem in html:
+                if elem.parent.name in ['style', 'script', '[document]', 'head', 'title']:
+                    continue
+                elif re.match('<!--.*-->', unicode(elem)):
+                    continue
+                else:
+                    text += unicode(elem)
+
+            self.handler.print_header(fpath)
+            self.parse_page(fpath, text, 1)
+            self.handler.print_footer(fpath)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+    def parse(self, path):
+        try:
+            if path.startswith('http://') or path.startswith('https://'):
+                if 'requests' not in IMPORTS:
+                    e = 'HTTP library not found: requests'
+                    raise ImportError(e)
+
+                headers = {'User-Agent': 'Mozilla/5.0 Gecko Firefox'}
+                r = requests.get(path, headers = headers)
+                r.raise_for_status()
+                f = StringIO(r.content)
+                self.parser_func(f, path)
+                return
+            if os.path.isfile(path):
+                with open(path, 'rb') as f:
+                    self.parser_func(f, path)
+                return
+            if os.path.isdir(path):
+                for walk_root, walk_dirs, walk_files in os.walk(path):
+                    for walk_file in fnmatch.filter(walk_files, self.ext_filter):
+                        fpath = os.path.join(walk_root, walk_file)
+                        with open(fpath, 'rb') as f:
+                            self.parser_func(f, fpath)
+                return
+
+            e = 'File path is not a file, directory or URL: %s' % (path)
+            raise IOError(e)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except Exception as e:
+            self.handler.print_error(path, e)
diff --git a/iocp/Whitelist.py b/iocp/Whitelist.py
deleted file mode 100644
index 5f12d31..0000000
--- a/iocp/Whitelist.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-import glob
-import re
-
-class WhiteList(dict):
-    def __init__(self, basedir):
-    	searchdir = os.path.join(basedir, "whitelists/whitelist_*.ini")
-        fpaths = glob.glob(searchdir)
-        for fpath in fpaths:
-            t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1]
-            patterns = [line.strip() for line in open(fpath)]
-            self[t]  = [re.compile(p) for p in patterns]
\ No newline at end of file
diff --git a/iocp/__init__.py b/iocp/__init__.py
index 31fcb29..b703061 100644
--- a/iocp/__init__.py
+++ b/iocp/__init__.py
@@ -5,4 +5,4 @@
 _IOCP_ROOT = os.path.abspath(os.path.dirname(__file__))
 
 def get_basedir():
-	return _IOCP_ROOT
\ No newline at end of file
+    return _IOCP_ROOT
diff --git a/requirements.txt b/requirements.txt
index 5ff7a2e..9d0c62d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,5 @@ beautifulsoup4>=4.4.1
 pdfminer>=20140328
 PyPDF2>=1.26.0
 requests>=2.10.0
+six>=1.10.0
+lxml>=3.8.0
diff --git a/setup.py b/setup.py
index 2186826..5bdaa82 100755
--- a/setup.py
+++ b/setup.py
@@ -1,28 +1,22 @@
 #!/usr/bin/env python
 
-import os
 from setuptools import setup
 
 setup(
-	name = "ioc_parser",
-	version = "0.9.1",
-	author = "Armin Buescher",
-	author_email = "armin.buescher@googlemail.com",
-	scripts=['bin/iocp'],
-	description = ("Tool to extract indicators of compromise from security reports"),
-	license = "MIT",
-	url = "https://github.com/armbues/ioc_parser",
-	packages=['iocp'],
-	include_package_data=True,
-	classifiers=[
-		"Development Status :: 4 - Beta",
-		"Topic :: Security",
-		"License :: OSI Approved :: MIT License",
-	],
-	install_requires=[
-		"pdfminer",
-		"PyPDF2",
-		"requests",
-		"beautifulsoup4"
-	],
-)
\ No newline at end of file
+    name = "ioc_parser",
+    version = "0.9.1",
+    author = "Armin Buescher",
+    author_email = "armin.buescher@googlemail.com",
+    scripts = ['bin/iocp'],
+    description = ("Tool to extract indicators of compromise from security reports"),
+    license = "MIT",
+    url = "https://github.com/armbues/ioc_parser",
+    packages = ['iocp'],
+    include_package_data = True,
+    classifiers = [
+        "Development Status :: 4 - Beta",
+        "Topic :: Security",
+        "License :: OSI Approved :: MIT License",
+    ],
+    install_requires= open("requirements.txt").read().splitlines(),
+)

From 28ba90ef3d044352780b78440c3be91d9104137c Mon Sep 17 00:00:00 2001
From: Floyd Hightower <floyd.hightower27@gmail.com>
Date: Wed, 18 Oct 2017 10:29:19 -0400
Subject: [PATCH 2/2] Fixing remaining python2 print statements

---
 iocp/Output.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/iocp/Output.py b/iocp/Output.py
index dbebe80..1e7e26a 100644
--- a/iocp/Output.py
+++ b/iocp/Output.py
@@ -114,7 +114,7 @@ def print_footer(self, fpath):
 
 class OutputHandler_netflow(OutputHandler):
     def __init__(self):
-        print "host 255.255.255.255"
+        print("host 255.255.255.255")
 
     def print_match(self, fpath, page, name, match):
         data = {
@@ -123,4 +123,4 @@ def print_match(self, fpath, page, name, match):
         }
 
         if data["type"] == "IP":
-            print " or host %s " % data["match"]
+            print(" or host %s " % data["match"])