From f3abd7abef7286a53515a526d70631a50b54f23d Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Mon, 14 Oct 2024 12:26:31 +0900
Subject: [PATCH] DedupeTool: Create WarcWriter on demand

This prevents an empty gzip member from being appended to the end of the output when no records were deduplicated.
---
 src/org/netpreserve/jwarc/tools/DedupeTool.java | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/org/netpreserve/jwarc/tools/DedupeTool.java b/src/org/netpreserve/jwarc/tools/DedupeTool.java
index 5348f40..f78b042 100644
--- a/src/org/netpreserve/jwarc/tools/DedupeTool.java
+++ b/src/org/netpreserve/jwarc/tools/DedupeTool.java
@@ -25,8 +25,12 @@ public class DedupeTool {
     public void deduplicateWarcFile(Path infile, Path outfile) throws IOException {
         try (FileChannel input = FileChannel.open(infile);
              WarcReader reader = new WarcReader(input);
-             FileChannel output = FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING);
-             WarcWriter writer = new WarcWriter(output, reader.compression())) {
+             FileChannel output = FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING)) {
+
+            // We create the WarcWriter on demand so that if no records are deduplicated we don't write an empty
+            // gzip member at the end of the file.
+            WarcWriter writer = null;
+
             WarcRecord record = reader.next().orElse(null);
             while (record != null) {
                 long position = reader.position();
@@ -40,6 +44,7 @@ record = reader.next().orElse(null);
                     transferExactly(input, position, length, output);
                 } else {
                     if (verbose) System.out.println("Writing revisit for " + position + ":" + length);
+                    if (writer == null) writer = new WarcWriter(output, reader.compression());
                     writer.write(revisit);
                 }
             }