Added new parsing features #29

Open · wants to merge 11 commits into base: master
6 changes: 3 additions & 3 deletions LICENSE
@@ -1,6 +1,7 @@
The MIT License (MIT)

Copyright (c) 2015 armbues
Original work: Copyright (c) 2015 armbues
Additional work: (c) Copyright 2016 Hewlett Packard Enterprise Development LP

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,5 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

SOFTWARE.
14 changes: 10 additions & 4 deletions README.md
@@ -3,10 +3,10 @@ IOC Parser is a tool to extract indicators of compromise from security reports i

## Usage
**iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE**
* *FILE* File/directory path to report(s)
* *FILE* File/directory path to report(s)/Gmail account in double quotes ("[email protected] password")
* *-p INI* Pattern file
* *-i FORMAT* Input format (pdf/txt/html)
* *-o FORMAT* Output format (csv/json/yara)
* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx/gmail)
* *-o FORMAT* Output format (csv/json/yara/netflow)
* *-d* Deduplicate matches
* *-l LIB* Parsing library
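
For example (illustrative invocations; file names and credentials are placeholders):

**iocp.py -i xlsx -o csv indicators.xlsx**

**iocp.py -i gmail -o json "[email protected] password"**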

@@ -19,4 +19,10 @@ For HTML parsing support:
* [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4*

For HTTP(S) support:
* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests*
* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests*

For XLS/XLSX support:
* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd*

For Gmail support:
* [gmail](https://github.com/charlierguo/gmail)
197 changes: 188 additions & 9 deletions iocp.py
@@ -40,6 +40,7 @@
import fnmatch
import argparse
import re
import csv
from StringIO import StringIO
try:
import configparser as ConfigParser
@@ -48,6 +49,18 @@

# Import optional third-party libraries
IMPORTS = []
try:
import xlrd
IMPORTS.append('xlrd')
except ImportError:
pass

try:
import gmail
IMPORTS.append('gmail')
except ImportError:
pass

try:
from PyPDF2 import PdfFileReader
IMPORTS.append('pypdf2')
@@ -81,14 +94,24 @@ class IOC_Parser(object):
patterns = {}
defang = {}

def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None):
def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', proxy=None, output_handler=None):
basedir = os.path.dirname(os.path.abspath(__file__))
if patterns_ini is None:
patterns_ini = os.path.join(basedir, 'patterns.ini')

self.load_patterns(patterns_ini)
self.whitelist = WhiteList(basedir)
self.dedup = dedup

# Normalise the proxy string into the dict format that requests expects;
# self.proxy stays None when no proxy was supplied, so parse() can test it safely
self.proxy = None
if proxy is not None:
if proxy.startswith('http://'):
self.proxy = {'http': proxy}
elif proxy.startswith('https://'):
self.proxy = {'https': proxy}
else:
self.proxy = proxy

if output_handler:
self.handler = output_handler
else:
@@ -111,6 +134,14 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='
if 'beautifulsoup' not in IMPORTS:
e = 'HTML parser library not found: BeautifulSoup'
raise ImportError(e)
elif input_format in ('xls', 'xlsx'):
if 'xlrd' not in IMPORTS:
e = 'XLS/XLSX parser library not found: xlrd (pip install xlrd)'
raise ImportError(e)
elif input_format == 'gmail':
if 'gmail' not in IMPORTS:
e = 'Gmail parser library not found: gmail (https://github.com/charlierguo/gmail)'
raise ImportError(e)
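
As a minimal illustrative sketch (not part of the diff), these checks make a missing optional dependency fail fast when the parser is constructed, assuming iocp.py is importable as a module:

from iocp import IOC_Parser

try:
    parser = IOC_Parser(input_format='xlsx')   # requires xlrd
except ImportError as err:
    print err   # e.g. 'XLS/XLSX parser library not found: xlrd (pip install xlrd)'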

def load_patterns(self, fpath):
config = ConfigParser.ConfigParser()
@@ -144,7 +175,20 @@ def is_whitelisted(self, ind_match, ind_type):
pass
return False

def parse_page(self, fpath, data, page_num):
def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''):
""" Scan a block of text for IOC patterns. The flag and sheet_name
parameters let the new input formats label their output correctly.

@param fpath: file path, directory, URL or email account
@param data: the text to be parsed
@param page_num: page number of a PDF, or line/row number of a CSV/XLS/XLSX
@param flag: source type used when printing matches:
0 = default (pdf/txt/html)
1 = gmail
2 = csv
3 = xls and xlsx
@param sheet_name: worksheet name, used only with Excel spreadsheets
"""
for ind_type, ind_regex in self.patterns.items():
matches = ind_regex.findall(data)

@@ -164,7 +208,8 @@

self.dedup_store.add((ind_type, ind_match))

self.handler.print_match(fpath, page_num, ind_type, ind_match)
# flag and sheet_name tell the output handler how to label this match
self.handler.print_match(fpath, page_num, ind_type, ind_match, flag, sheet_name)
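
Output handling lives in output.py, which is not shown in this part of the diff; purely as a hypothetical sketch, a handler might use the extra arguments along these lines (all names and formats below are illustrative):

def print_match(self, fpath, page, name, match, flag=0, sheet_name=''):
    # Hypothetical: choose a location label based on the source-type flag
    if flag == 1:                    # gmail: fpath carries the message subject
        location = 'subject "%s"' % fpath
    elif flag == 2:                  # csv: page is the line number
        location = '%s|line %d' % (fpath, page)
    elif flag == 3:                  # xls/xlsx: include the worksheet name
        location = '%s|%s|row %d' % (fpath, sheet_name, page)
    else:                            # pdf/txt/html: original behaviour
        location = '%s|page %d' % (fpath, page)
    print '%s|%s|%s' % (location, name, match)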

def parse_pdf_pypdf2(self, f, fpath):
try:
@@ -246,7 +291,7 @@ def parse_html(self, f, fpath):
self.dedup_store = set()

data = f.read()
soup = BeautifulSoup(data)
soup = BeautifulSoup(data, 'html.parser')  # name the parser explicitly to suppress BeautifulSoup's warning
html = soup.findAll(text=True)

text = u''
@@ -266,15 +311,139 @@
except Exception as e:
self.handler.print_error(fpath, e)

def parse_csv(self, f, fpath):
""" This method is used to parse a csv file. The flag
used for this method to send to output.py is 2.

@author Robb Krasnow
"""
try:
if self.dedup:
self.dedup_store = set()

self.handler.print_header(fpath)

with open(fpath, 'rb') as csvfile:
csv_data = csv.reader(csvfile, delimiter=',', quotechar='|')

for row in csv_data:
line = ', '.join(row).rstrip()
unicode_output = unicode(line, 'ascii', errors='ignore')

self.parse_page(fpath, unicode_output, csv_data.line_num, 2)

self.handler.print_footer(fpath)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
self.handler.print_error(fpath, e)
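
A small illustrative aside (not part of the diff; placeholder data): each CSV row is re-joined with ', ' before pattern matching, and '|' acts as the quote character, so commas inside a |-quoted field stay within one row string:

import csv
from StringIO import StringIO

sample = StringIO('evil.example.com,|203.0.113.7, backup C2|\n')
for row in csv.reader(sample, delimiter=',', quotechar='|'):
    print ', '.join(row).rstrip()   # evil.example.com, 203.0.113.7, backup C2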


def parse_xls(self, f, fpath):
""" Created this function just to allow a user to use 'xls' as an input
option without any errors.

@author Robb Krasnow
"""
self.parse_xlsx(f, fpath)


def parse_xlsx(self, f, fpath):
""" This method is used to parse Microsoft Excel files
with either .xls or .xlsx extentions. The flag
used for this method to send to output.py is 3. Because
Excel spreadsheets may have multiple tabs, the sheet's
name is passed through the parse_page method in turn showing
that in the output.

@author Robb Krasnow
"""
try:
if self.dedup:
self.dedup_store = set()

self.handler.print_header(fpath)
workbook = xlrd.open_workbook(fpath)
sheets = workbook.sheets()

for sheet in sheets:
sheet_name = sheet.name

for row in range(sheet.nrows):
for col in range(sheet.ncols):
if sheet.cell_value(row, col) != xlrd.empty_cell.value:  # compare by value, not identity
val = repr(sheet.cell_value(row, col))

self.parse_page(fpath, val, row+1, 3, sheet_name)

self.handler.print_footer(fpath)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
self.handler.print_error(fpath, e)


def parse_gmail(self, username, password):
""" This method is used to parse the inbox of a valid
Gmail account. The flag used for this method to send to
output.py is 1.

@author Robb Krasnow
@param username The gmail account's username
@param password The gmail account's password
"""
try:
if self.dedup:
self.dedup_store = set()

# Log the user in
g = gmail.login(username, password)

# When the user is logged in, grab all the email from their inbox
# and parse all the messages for IOCs
if g.logged_in:
print '***** Login Successful. *****\n'

self.handler.print_header(username)
emails = g.inbox().mail()

for email in range(0, len(emails)):
try:
emails[email].fetch()
content = emails[email].body
subject = re.sub('(^\s|re:\s+|\r\n|fwd:\s+)', '', emails[email].subject, flags=re.IGNORECASE)

self.parse_page(subject, content, 1, 1)
except Exception as e:
continue

self.handler.print_footer(username)

print '\n***** %s emails found. *****' % len(emails)
g.logout()
print '***** Logout Successful. *****'
else:
sys.exit()
except gmail.exceptions.AuthenticationError:
print 'Authentication Error'
sys.exit()


def parse(self, path):
try:
if path.startswith('http://') or path.startswith('https://'):
if 'requests' not in IMPORTS:
e = 'HTTP library not found: requests'
raise ImportError(e)
headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' }
r = requests.get(path, headers=headers)
r.raise_for_status()

# Route the request through the proxy from --proxy when one was configured
if self.proxy is not None:
r = requests.get(path, headers=headers, proxies=self.proxy)
else:
r = requests.get(path, headers=headers)

f = StringIO(r.content)
self.parser_func(f, path)
return
@@ -289,6 +458,15 @@ def parse(self, path):
with open(fpath, 'rb') as f:
self.parser_func(f, fpath)
return
# If the CLI input looks like '[email protected] password',
# split out the credentials and hand them to the Gmail parser
elif path.count('@gmail.com ') == 1 and len(path.split()) == 2:
gmail_account = path.split()
username = gmail_account[0]
password = gmail_account[1]
self.parser_func(username, password)

return

e = 'File path is not a file, directory or URL: %s' % (path)
raise IOError(e)
@@ -299,13 +477,14 @@

if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)')
argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)/Gmail account in double quotes ("[email protected] password")')
argparser.add_argument('-p', dest='INI', default=None, help='Pattern file')
argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)')
argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx/gmail)')
argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)')
argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches')
argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)')
argparser.add_argument('--proxy', dest='PROXY', default=None, help='Sets proxy (http(s)://server:port)')
args = argparser.parse_args()

parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT)
parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT, args.PROXY)
parser.parse(args.PATH)
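
As a minimal illustrative sketch (not part of the diff), the same options are available when using the class directly; the values below are placeholders, and iocp.py is assumed to be importable as a module:

from iocp import IOC_Parser

parser = IOC_Parser(input_format='pdf', output_format='json', dedup=True,
                    library='pdfminer', proxy='http://127.0.0.1:8080')
parser.parse('https://example.com/threat-report.pdf')   # fetched through the proxy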