
Commit 6b9d20c

Refactor storing data and writing to disk - using gzip and lines
In previous versions, the collector stored all collected flows (parsed data) in memory and, at regular intervals or at shutdown, dumped this single dict as JSON onto disk. With this commit, each flow set is written as its own JSON line and appended to a gzipped file on disk for storage efficiency. analyze_json.py is updated as well to handle the gzipped files in the new format. See the comments in main.py for more details. Fixes #10
1 parent 3dee135 commit 6b9d20c
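
For illustration, a minimal sketch of the new on-disk format described above (not part of the commit; the file name "flows.gz" and the flow fields are made-up placeholders): each appended line is one gzip-compressed JSON object whose single key is the export timestamp and whose value is the list of flow dicts from that export.

import gzip
import json

# Hypothetical example entry: one export, keyed by its timestamp, with two flow dicts
# (field names and values are made up for illustration).
entry = {"1577482000": [{"IPV4_SRC_ADDR": "10.0.0.1", "IN_BYTES": 512},
                        {"IPV4_SRC_ADDR": "10.0.0.2", "IN_BYTES": 64}]}

# One byte-encoded JSON line per export, appended to a gzipped file.
with gzip.open("flows.gz", "ab") as fh:
    fh.write(json.dumps(entry).encode() + b"\n")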

File tree: 2 files changed, +56 -21 lines (analyze_json.py, main.py)

analyze_json.py

Lines changed: 34 additions & 6 deletions

@@ -13,16 +13,20 @@
 import contextlib
 from datetime import datetime
 import functools
+import gzip
 import ipaddress
 import json
+import logging
+import os.path
 import socket
 import sys


 Pair = namedtuple('Pair', ['src', 'dest'])
+logger = logging.getLogger(__name__)


-@functools.lru_cache(maxsize=128)
+@functools.lru_cache(maxsize=None)
 def resolve_hostname(ip):
     return socket.getfqdn(ip)

@@ -139,14 +143,38 @@ def service(self):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Output a basic analysis of NetFlow data")
-    parser.add_argument('filename', nargs='?', type=argparse.FileType('r'),
-                        default=sys.stdin,
+    parser.add_argument('-f', '--file', dest='file', type=str, default=sys.stdin,
                         help="The file to analyze (defaults to stdin if not provided)")
     args = parser.parse_args()

-    data = json.load(args.filename)
+    # Using a file and using stdin differ in their further usage for gzip.open
+    file = args.file
+    mode = "rb"  # reading files
+    if file != sys.stdin and not os.path.exists(file):
+        exit("File {} does not exist!".format(file))
+
+    if file == sys.stdin:
+        file = sys.stdin.buffer
+        mode = "rt"  # reading from stdin
+
+    data = {}
+
+    with gzip.open(file, mode) as gzipped:
+        # "for line in" lazy-loads all lines in the file
+        for line in gzipped:
+            entry = json.loads(line)
+            if len(entry.keys()) != 1:
+                logger.warning("Line \"{}\" does not have exactly one timestamp key.".format(line))
+
+            try:
+                ts = list(entry)[0]  # timestamp from key
+            except KeyError:
+                logger.error("Saved line \"{}\" has no timestamp key!".format(line))
+                continue
+
+            data[ts] = entry[ts]

-    # Go through data and disect every flow saved inside the dump
+    # Go through data and dissect every flow saved inside the dump
     for key in sorted(data):
         timestamp = datetime.fromtimestamp(float(key)).strftime("%Y-%m-%d %H:%M.%S")

@@ -157,7 +185,7 @@ def service(self):
                 pending = flow
                 continue
             con = Connection(pending, flow)
-            print("{timestamp}: {service:7} | {size:8} | {duration:9} | {src_host} ({src}) to {dest_host} ({dest})" \
+            print("{timestamp}: {service:<10} | {size:8} | {duration:9} | {src_host} ({src}) to {dest_host} ({dest})" \
                 .format(timestamp=timestamp, service=con.service.upper(), src_host=con.hostnames.src, src=con.src,
                         dest_host=con.hostnames.dest, dest=con.dest, size=con.human_size, duration=con.human_duration))
             pending = None
main.py

Lines changed: 22 additions & 15 deletions

@@ -11,6 +11,7 @@
 import argparse
 from collections import namedtuple
 import queue
+import gzip
 import json
 import logging
 import sys
@@ -20,7 +21,6 @@

 from netflow import parse_packet, TemplateNotRecognized, UnknownNetFlowVersion

-
 logger = logging.getLogger(__name__)

 # Amount of time to wait before dropping an undecodable ExportPacket
@@ -122,7 +122,7 @@ def run(self):
             else:
                 to_retry.append(pkt)
                 logger.debug("Failed to decode a v9 ExportPacket - will "
-                    "re-attempt when a new template is discovered")
+                             "re-attempt when a new template is discovered")
                 continue

             logger.debug("Processed a v%d ExportPacket with %d flows.",
@@ -172,8 +172,8 @@ def get_export_packets(host, port):
     parser.add_argument("--port", "-p", type=int, default=2055,
                         help="collector listener port")
     parser.add_argument("--file", "-o", type=str, dest="output_file",
-                        default="{}.json".format(int(time.time())),
-                        help="collector export JSON file")
+                        default="{}.gz".format(int(time.time())),
+                        help="collector export multiline JSON file")
     parser.add_argument("--debug", "-D", action="store_true",
                         help="Enable debug output")
     args = parser.parse_args()
@@ -183,19 +183,26 @@ def get_export_packets(host, port):
     if args.debug:
         logger.setLevel(logging.DEBUG)

-    data = {}
     try:
-        # TODO: For a long-running processes, this will consume loads of memory
+        # With every parsed flow a new line is appended to the output file. In previous versions, this was implemented
+        # by storing the whole data dict in memory and dumping it regularly onto disk. This was extremely fragile, as
+        # it a) consumed a lot of memory and CPU (dropping packets since storing one flow took longer than the arrival
+        # of the next flow) and b) broke the exported JSON file if the collector crashed during the write process,
+        # rendering all flows collected during the runtime of the collector useless (the file contained one large JSON
+        # dict which represented the 'data' dict).
+
+        # In this new approach, each received flow is parsed as usual, but it gets appended to a gzipped file each time.
+        # All in all, this improves three aspects:
+        # 1. collected flow data is not stored in memory any more
+        # 2. received and parsed flows are persisted reliably
+        # 3. the disk usage of files with JSON and its full strings as keys is reduced by using gzipped files
+        # This also means that the files have to be handled differently, because they are gzipped and not formatted as
+        # one single big JSON dump, but rather many little JSON dumps, separated by line breaks.
         for ts, export in get_export_packets(args.host, args.port):
-            data[ts] = [flow.data for flow in export.flows]
+            entry = {ts: [flow.data for flow in export.flows]}
+            line = json.dumps(entry).encode() + b"\n"  # byte encoded line
+            with gzip.open(args.output_file, "ab") as fh:  # open as append, not reading the whole file
+                fh.write(line)
     except KeyboardInterrupt:
         logger.info("Received KeyboardInterrupt, passing through")
         pass
-
-    if data:
-        # TODO: this should be done periodically to not lose any data (only saved in memory)
-        logger.info("Outputting collected data to '%s'", args.output_file)
-        with open(args.output_file, 'w') as f:
-            json.dump(data, f)
-    else:
-        logger.info("No data collected")
