Added new parsing features #29

Open · wants to merge 11 commits into base: master
6 changes: 3 additions & 3 deletions LICENSE
@@ -1,6 +1,7 @@
The MIT License (MIT)

Copyright (c) 2015 armbues
Original work: Copyright (c) 2015 armbues
Additional work: (c) Copyright 2016 Hewlett Packard Enterprise Development LP

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,5 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

SOFTWARE.
14 changes: 10 additions & 4 deletions README.md
@@ -3,10 +3,10 @@ IOC Parser is a tool to extract indicators of compromise from security reports i

## Usage
**iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE**
* *FILE* File/directory path to report(s)
* *FILE* File/directory path to report(s)/Gmail account in double quotes ("[email protected] password")
* *-p INI* Pattern file
* *-i FORMAT* Input format (pdf/txt/html)
* *-o FORMAT* Output format (csv/json/yara)
* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx/gmail)
* *-o FORMAT* Output format (csv/json/yara/netflow)
* *-d* Deduplicate matches
* *-l LIB* Parsing library
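
For example (illustrative invocations; file names and credentials are placeholders):

**iocp.py -i xlsx -o csv indicators.xlsx**

**iocp.py -i gmail -o json "[email protected] password"**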

@@ -19,4 +19,10 @@ For HTML parsing support:
* [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4*

For HTTP(S) support:
* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests*
* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests*

For XLS/XLSX support:
* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd*

For Gmail support:
* [gmail](https://github.com/charlierguo/gmail)
197 changes: 188 additions & 9 deletions iocp.py
@@ -40,6 +40,7 @@
import fnmatch
import argparse
import re
import csv
from StringIO import StringIO
try:
import configparser as ConfigParser
@@ -48,6 +49,18 @@

# Import optional third-party libraries
IMPORTS = []
try:
import xlrd
IMPORTS.append('xlrd')
except ImportError:
pass

try:
import gmail
IMPORTS.append('gmail')
except ImportError:
pass

try:
from PyPDF2 import PdfFileReader
IMPORTS.append('pypdf2')
@@ -81,14 +94,24 @@ class IOC_Parser(object):
patterns = {}
defang = {}

def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None):
def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', proxy=None, output_handler=None):
basedir = os.path.dirname(os.path.abspath(__file__))
if patterns_ini is None:
patterns_ini = os.path.join(basedir, 'patterns.ini')

self.load_patterns(patterns_ini)
self.whitelist = WhiteList(basedir)
self.dedup = dedup

# Normalise the proxy string into the dict format that requests expects;
# self.proxy stays None when no proxy was supplied, so parse() can test it safely
self.proxy = None
if proxy is not None:
if proxy.startswith('http://'):
self.proxy = {'http': proxy}
elif proxy.startswith('https://'):
self.proxy = {'https': proxy}
else:
self.proxy = proxy

if output_handler:
self.handler = output_handler
else:
@@ -111,6 +134,14 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='
if 'beautifulsoup' not in IMPORTS:
e = 'HTML parser library not found: BeautifulSoup'
raise ImportError(e)
elif input_format in ('xls', 'xlsx'):
if 'xlrd' not in IMPORTS:
e = 'XLS/XLSX parser library not found: xlrd (pip install xlrd)'
raise ImportError(e)
elif input_format == 'gmail':
if 'gmail' not in IMPORTS:
e = 'Gmail parser library not found: gmail (https://github.com/charlierguo/gmail)'
raise ImportError(e)
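
As a minimal illustrative sketch (not part of the diff), these checks make a missing optional dependency fail fast when the parser is constructed, assuming iocp.py is importable as a module:

from iocp import IOC_Parser

try:
    parser = IOC_Parser(input_format='xlsx')   # requires xlrd
except ImportError as err:
    print err   # e.g. 'XLS/XLSX parser library not found: xlrd (pip install xlrd)'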

def load_patterns(self, fpath):
config = ConfigParser.ConfigParser()
@@ -144,7 +175,20 @@ def is_whitelisted(self, ind_match, ind_type):
pass
return False

def parse_page(self, fpath, data, page_num):
def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''):
""" Scan a block of text for IOC patterns. The flag and sheet_name
parameters let the new input formats label their output correctly.

@param fpath: file path, directory, URL or email account
@param data: the text to be parsed
@param page_num: page number of a PDF, or line/row number of a CSV/XLS/XLSX
@param flag: source type used when printing matches:
0 = default (pdf/txt/html)
1 = gmail
2 = csv
3 = xls and xlsx
@param sheet_name: worksheet name, used only with Excel spreadsheets
"""
for ind_type, ind_regex in self.patterns.items():
matches = ind_regex.findall(data)

@@ -164,7 +208,8 @@

self.dedup_store.add((ind_type, ind_match))

self.handler.print_match(fpath, page_num, ind_type, ind_match)
# flag and sheet_name tell the output handler how to label this match
self.handler.print_match(fpath, page_num, ind_type, ind_match, flag, sheet_name)
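
Output handling lives in output.py, which is not shown in this part of the diff; purely as a hypothetical sketch, a handler might use the extra arguments along these lines (all names and formats below are illustrative):

def print_match(self, fpath, page, name, match, flag=0, sheet_name=''):
    # Hypothetical: choose a location label based on the source-type flag
    if flag == 1:                    # gmail: fpath carries the message subject
        location = 'subject "%s"' % fpath
    elif flag == 2:                  # csv: page is the line number
        location = '%s|line %d' % (fpath, page)
    elif flag == 3:                  # xls/xlsx: include the worksheet name
        location = '%s|%s|row %d' % (fpath, sheet_name, page)
    else:                            # pdf/txt/html: original behaviour
        location = '%s|page %d' % (fpath, page)
    print '%s|%s|%s' % (location, name, match)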

def parse_pdf_pypdf2(self, f, fpath):
try:
@@ -246,7 +291,7 @@ def parse_html(self, f, fpath):
self.dedup_store = set()

data = f.read()
soup = BeautifulSoup(data)
soup = BeautifulSoup(data, 'html.parser')  # name the parser explicitly to suppress BeautifulSoup's warning
html = soup.findAll(text=True)

text = u''
@@ -266,15 +311,139 @@
except Exception as e:
self.handler.print_error(fpath, e)

def parse_csv(self, f, fpath):
""" This method is used to parse a csv file. The flag
used for this method to send to output.py is 2.

@author Robb Krasnow
"""
try:
if self.dedup:
self.dedup_store = set()

self.handler.print_header(fpath)

with open(fpath, 'rb') as csvfile:
csv_data = csv.reader(csvfile, delimiter=',', quotechar='|')

for row in csv_data:
line = ', '.join(row).rstrip()
unicode_output = unicode(line, 'ascii', errors='ignore')

self.parse_page(fpath, unicode_output, csv_data.line_num, 2)

self.handler.print_footer(fpath)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
self.handler.print_error(fpath, e)
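
A small illustrative aside (not part of the diff; placeholder data): each CSV row is re-joined with ', ' before pattern matching, and '|' acts as the quote character, so commas inside a |-quoted field stay within one row string:

import csv
from StringIO import StringIO

sample = StringIO('evil.example.com,|203.0.113.7, backup C2|\n')
for row in csv.reader(sample, delimiter=',', quotechar='|'):
    print ', '.join(row).rstrip()   # evil.example.com, 203.0.113.7, backup C2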


def parse_xls(self, f, fpath):
""" Created this function just to allow a user to use 'xls' as an input
option without any errors.

@author Robb Krasnow
"""
self.parse_xlsx(f, fpath)


def parse_xlsx(self, f, fpath):
""" This method is used to parse Microsoft Excel files
with either .xls or .xlsx extentions. The flag
used for this method to send to output.py is 3. Because
Excel spreadsheets may have multiple tabs, the sheet's
name is passed through the parse_page method in turn showing
that in the output.

@author Robb Krasnow
"""
try:
if self.dedup:
self.dedup_store = set()

self.handler.print_header(fpath)
workbook = xlrd.open_workbook(fpath)
sheets = workbook.sheets()

for sheet in sheets:
sheet_name = sheet.name

for row in range(sheet.nrows):
for col in range(sheet.ncols):
if sheet.cell_value(row, col) != xlrd.empty_cell.value:  # compare by value, not identity
val = repr(sheet.cell_value(row, col))

self.parse_page(fpath, val, row+1, 3, sheet_name)

self.handler.print_footer(fpath)
except (KeyboardInterrupt, SystemExit):
raise
except Exception as e:
self.handler.print_error(fpath, e)


def parse_gmail(self, username, password):
""" This method is used to parse the inbox of a valid
Gmail account. The flag used for this method to send to
output.py is 1.

@author Robb Krasnow
@param username The gmail account's username
@param password The gmail account's password
"""
try:
if self.dedup:
self.dedup_store = set()

# Log the user in
g = gmail.login(username, password)

# When the user is logged in, grab all the email from their inbox
# and parse all the messages for IOCs
if g.logged_in:
print '***** Login Successful. *****\n'

self.handler.print_header(username)
emails = g.inbox().mail()

for email in range(0, len(emails)):
try:
emails[email].fetch()
content = emails[email].body
subject = re.sub('(^\s|re:\s+|\r\n|fwd:\s+)', '', emails[email].subject, flags=re.IGNORECASE)

self.parse_page(subject, content, 1, 1)
except Exception as e:
continue

self.handler.print_footer(username)

print '\n***** %s emails found. *****' % len(emails)
g.logout()
print '***** Logout Successful. *****'
else:
sys.exit()
except gmail.exceptions.AuthenticationError:
print 'Authentication Error'
sys.exit()


def parse(self, path):
try:
if path.startswith('http://') or path.startswith('https://'):
if 'requests' not in IMPORTS:
e = 'HTTP library not found: requests'
raise ImportError(e)
headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' }
r = requests.get(path, headers=headers)
r.raise_for_status()

# Route the request through the proxy from --proxy when one was configured
if self.proxy is not None:
r = requests.get(path, headers=headers, proxies=self.proxy)
else:
r = requests.get(path, headers=headers)

f = StringIO(r.content)
self.parser_func(f, path)
return
@@ -289,6 +458,15 @@ def parse(self, path):
with open(fpath, 'rb') as f:
self.parser_func(f, fpath)
return
# If the CLI input looks like '[email protected] password',
# split out the credentials and hand them to the Gmail parser
elif path.count('@gmail.com ') == 1 and len(path.split()) == 2:
gmail_account = path.split()
username = gmail_account[0]
password = gmail_account[1]
self.parser_func(username, password)

return

e = 'File path is not a file, directory or URL: %s' % (path)
raise IOError(e)
@@ -299,13 +477,14 @@

if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)')
argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)/Gmail account in double quotes ("[email protected] password")')
argparser.add_argument('-p', dest='INI', default=None, help='Pattern file')
argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)')
argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx/gmail)')
argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)')
argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches')
argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)')
argparser.add_argument('--proxy', dest='PROXY', default=None, help='Sets proxy (http(s)://server:port)')
args = argparser.parse_args()

parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT)
parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT, args.PROXY)
parser.parse(args.PATH)
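
As a minimal illustrative sketch (not part of the diff), the same options are available when using the class directly; the values below are placeholders, and iocp.py is assumed to be importable as a module:

from iocp import IOC_Parser

parser = IOC_Parser(input_format='pdf', output_format='json', dedup=True,
                    library='pdfminer', proxy='http://127.0.0.1:8080')
parser.parse('https://example.com/threat-report.pdf')   # fetched through the proxy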