Skip to content

Commit

Permalink
DedupeTool: Create WarcWriter on demand
Browse files Browse the repository at this point in the history
This prevents an empty gzip member from being appended to the end of the
output when no records were deduplicated.
  • Loading branch information
ato committed Oct 14, 2024
1 parent af5fb49 commit f3abd7a
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions src/org/netpreserve/jwarc/tools/DedupeTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@ public class DedupeTool {
public void deduplicateWarcFile(Path infile, Path outfile) throws IOException {
try (FileChannel input = FileChannel.open(infile);
WarcReader reader = new WarcReader(input);
FileChannel output = FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING);
WarcWriter writer = new WarcWriter(output, reader.compression())) {
FileChannel output = FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING)) {

// Create the WarcWriter on demand so that, if no records are deduplicated, we don't append an empty
// gzip member to the end of the output file.
WarcWriter writer = null;

WarcRecord record = reader.next().orElse(null);
while (record != null) {
long position = reader.position();
Expand All @@ -40,6 +44,7 @@ record = reader.next().orElse(null);
transferExactly(input, position, length, output);
} else {
if (verbose) System.out.println("Writing revisit for " + position + ":" + length);
if (writer == null) writer = new WarcWriter(output, reader.compression());
writer.write(revisit);
}
}
Expand Down

0 comments on commit f3abd7a

Please sign in to comment.