@@ -11,6 +11,7 @@
 import argparse
 from collections import namedtuple
 import queue
+import gzip
 import json
 import logging
 import sys
@@ -20,7 +21,6 @@
 
 from netflow import parse_packet, TemplateNotRecognized, UnknownNetFlowVersion
 
-
 logger = logging.getLogger(__name__)
 
 # Amount of time to wait before dropping an undecodable ExportPacket
@@ -122,7 +122,7 @@ def run(self):
             else:
                 to_retry.append(pkt)
                 logger.debug("Failed to decode a v9 ExportPacket - will "
-                    "re-attempt when a new template is discovered")
+                             "re-attempt when a new template is discovered")
                 continue
 
             logger.debug("Processed a v%d ExportPacket with %d flows.",
@@ -172,8 +172,8 @@ def get_export_packets(host, port):
     parser.add_argument("--port", "-p", type=int, default=2055,
                         help="collector listener port")
     parser.add_argument("--file", "-o", type=str, dest="output_file",
-                        default="{}.json".format(int(time.time())),
-                        help="collector export JSON file")
+                        default="{}.gz".format(int(time.time())),
+                        help="collector export multiline JSON file")
     parser.add_argument("--debug", "-D", action="store_true",
                         help="Enable debug output")
     args = parser.parse_args()
@@ -183,19 +183,26 @@ def get_export_packets(host, port):
     if args.debug:
         logger.setLevel(logging.DEBUG)
 
-    data = {}
     try:
-        # TODO: For a long-running processes, this will consume loads of memory
+        # With every parsed flow a new line is appended to the output file. In previous versions, this was implemented
+        # by storing the whole data dict in memory and dumping it regularly onto disk. This was extremely fragile, as
+        # it a) consumed a lot of memory and CPU (dropping packets since storing one flow took longer than the arrival
+        # of the next flow) and b) broke the exported JSON file if the collector crashed during the write process,
+        # rendering all flows collected during the collector's runtime useless (the file contained one large JSON
+        # dict which represented the 'data' dict).
+
+        # In this new approach, each received flow is parsed as usual, but is then appended to a gzipped file each time.
+        # All in all, this improves three aspects:
+        #   1. collected flow data is no longer stored in memory
+        #   2. received and parsed flows are persisted reliably
+        #   3. the disk usage of the JSON output, with its repeated full-string keys, is reduced by gzipping the files
+        # This also means that the files have to be handled differently: they are gzipped and not formatted as one
+        # single big JSON dump, but rather as many small JSON dumps, separated by line breaks.
         for ts, export in get_export_packets(args.host, args.port):
-            data[ts] = [flow.data for flow in export.flows]
+            entry = {ts: [flow.data for flow in export.flows]}
+            line = json.dumps(entry).encode() + b"\n"  # byte-encoded line
+            with gzip.open(args.output_file, "ab") as fh:  # append mode, does not read the whole file back in
+                fh.write(line)
     except KeyboardInterrupt:
         logger.info("Received KeyboardInterrupt, passing through")
         pass
-
-    if data:
-        # TODO: this should be done periodically to not lose any data (only saved in memory)
-        logger.info("Outputting collected data to '%s'", args.output_file)
-        with open(args.output_file, 'w') as f:
-            json.dump(data, f)
-    else:
-        logger.info("No data collected")
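Since the output is now a gzip-compressed, line-delimited JSON file rather than one large JSON document, it has to be read back line by line. A minimal reader sketch under that assumption; the file name "1577832000.gz" is a placeholder for whatever --file pointed at, and Python's gzip module transparently decompresses the members that each append wrote:

import gzip
import json

flows = {}
# "1577832000.gz" is a placeholder for the collector's output file (--file).
with gzip.open("1577832000.gz", "rb") as fh:
    for line in fh:                          # one JSON dump per line
        entry = json.loads(line.decode())    # {timestamp: [flow_data, ...]}
        flows.update(entry)

print("Read {} export packets".format(len(flows)))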