composefs · cgwalters · Mar 2, 2026
diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs
@@ -9,16 +9,10 @@
 #![no_main]
 
 use libfuzzer_sys::fuzz_target;
-use tar_core_testutil::{parse_tar_core, parse_tar_rs};
-
-fuzz_target!(|data: &[u8]| {
-    if data.len() > 256 * 1024 {
-        return;
-    }
-
-    let tar_rs_entries = parse_tar_rs(data);
-    let tar_core_entries = parse_tar_core(data);
+use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry};
 
+/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
+fn compare_entries(tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) {
     assert_eq!(
         tar_core_entries.len(),
         tar_rs_entries.len(),
@@ -27,10 +21,7 @@ fuzz_target!(|data: &[u8]| {
         tar_rs_entries.len(),
     );
 
-    for i in 0..tar_rs_entries.len() {
-        let rs = &tar_rs_entries[i];
-        let core = &tar_core_entries[i];
-
+    for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() {
         assert_eq!(
             rs.path,
             core.path,
@@ -68,4 +59,88 @@ fuzz_target!(|data: &[u8]| {
         );
         assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}");
     }
+}
+
+/// Preprocess fuzz input to fix up tar header checksums.
+///
+/// Walks 512-byte aligned blocks. Each non-zero block is treated as a header
+/// and given a valid checksum; its octal size field is then used to skip the
+/// content blocks, unless that skip would run past the end of the input.
+///
+/// This dramatically improves fuzzing coverage by allowing the parser to get
+/// past the checksum verification gate and exercise deeper parsing logic
+/// (PAX extensions, GNU long name/link, sparse files, etc.).
+fn fixup_checksums(data: &mut [u8]) {
+    let mut offset = 0;
+    while offset + 512 <= data.len() {
+        let block = &data[offset..offset + 512];
+
+        // Skip zero blocks (end-of-archive markers)
+        if block.iter().all(|&b| b == 0) {
+            offset += 512;
+            continue;
+        }
+
+        // Fill checksum field (bytes 148..156) with spaces
+        let block = &mut data[offset..offset + 512];
+        block[148..156].fill(b' ');
+
+        // Compute checksum: unsigned sum of all 512 bytes, field read as spaces
+        let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum();
+
+        // Encode as 7 octal digits + NUL terminator
+        let cksum_str = format!("{:07o}\0", checksum);
+        let cksum_bytes = cksum_str.as_bytes();
+        let copy_len = cksum_bytes.len().min(8);
+        block[148..148 + copy_len].copy_from_slice(&cksum_bytes[..copy_len]);
+
+        offset += 512;
+
+        // Parse the size field (bytes 124..136) of the header just processed
+        let size_field = &data[offset - 512 + 124..offset - 512 + 136];
+        if let Some(size) = parse_octal_simple(size_field) {
+            let padded = ((size as usize) + 511) & !511;
+            if offset + padded <= data.len() {
+                offset += padded;
+            }
+        }
+    }
+}
+
+/// Simple octal parser for the size field; base-256 is not handled because
+/// we only skip content. A blank field is Some(0); invalid digits are None.
+fn parse_octal_simple(bytes: &[u8]) -> Option<u64> {
+    let trimmed: Vec<u8> = bytes
+        .iter()
+        .copied()
+        .skip_while(|&b| b == b' ')
+        .take_while(|&b| b != b' ' && b != 0)
+        .collect();
+    if trimmed.is_empty() {
+        return Some(0);
+    }
+    let s = core::str::from_utf8(&trimmed).ok()?;
+    u64::from_str_radix(s, 8).ok()
+}
+
+fuzz_target!(|data: &[u8]| {
+    if data.len() > 256 * 1024 {
+        return;
+    }
+
+    // ~90% of the time (first byte not a multiple of 10), fix up checksums to
+    // reach deeper parser logic; otherwise pass raw bytes to test validation.
+    let should_fixup = !data.is_empty() && data[0] % 10 != 0;
+
+    if should_fixup {
+        let mut data = data.to_vec();
+        fixup_checksums(&mut data);
+        let tar_rs_entries = parse_tar_rs(&data);
+        let tar_core_entries = parse_tar_core(&data);
+        compare_entries(&tar_rs_entries, &tar_core_entries);
+    } else {
+        let tar_rs_entries = parse_tar_rs(data);
+        let tar_core_entries = parse_tar_core(data);
+        compare_entries(&tar_rs_entries, &tar_core_entries);
+    }
 });
diff --git a/fuzz/fuzz_targets/parse.rs b/fuzz/fuzz_targets/parse.rs
@@ -98,9 +98,80 @@ fn run_parser(data: &[u8], limits: Limits) {
     );
 }
 
+/// Preprocess fuzz input to fix up tar header checksums.
+///
+/// Walks 512-byte aligned blocks. Each non-zero block is treated as a header
+/// and given a valid checksum; its octal size field is then used to skip the
+/// content blocks, unless that skip would run past the end of the input.
+///
+/// This dramatically improves fuzzing coverage by allowing the parser to get
+/// past the checksum verification gate and exercise deeper parsing logic
+/// (PAX extensions, GNU long name/link, sparse files, etc.).
+fn fixup_checksums(data: &mut [u8]) {
+    let mut offset = 0;
+    while offset + 512 <= data.len() {
+        let block = &data[offset..offset + 512];
+
+        // Skip zero blocks (end-of-archive markers)
+        if block.iter().all(|&b| b == 0) {
+            offset += 512;
+            continue;
+        }
+
+        // Fill checksum field (bytes 148..156) with spaces
+        let block = &mut data[offset..offset + 512];
+        block[148..156].fill(b' ');
+
+        // Compute checksum: unsigned sum of all 512 bytes, field read as spaces
+        let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum();
+
+        // Encode as 7 octal digits + NUL terminator
+        let cksum_str = format!("{:07o}\0", checksum);
+        let cksum_bytes = cksum_str.as_bytes();
+        let copy_len = cksum_bytes.len().min(8);
+        block[148..148 + copy_len].copy_from_slice(&cksum_bytes[..copy_len]);
+
+        offset += 512;
+
+        // Parse the size field (bytes 124..136) of the header just processed
+        let size_field = &data[offset - 512 + 124..offset - 512 + 136];
+        if let Some(size) = parse_octal_simple(size_field) {
+            let padded = ((size as usize) + 511) & !511;
+            if offset + padded <= data.len() {
+                offset += padded;
+            }
+        }
+    }
+}
+
+/// Simple octal parser for the size field; base-256 is not handled because
+/// we only skip content. A blank field is Some(0); invalid digits are None.
+fn parse_octal_simple(bytes: &[u8]) -> Option<u64> {
+    let trimmed: Vec<u8> = bytes
+        .iter()
+        .copied()
+        .skip_while(|&b| b == b' ')
+        .take_while(|&b| b != b' ' && b != 0)
+        .collect();
+    if trimmed.is_empty() {
+        return Some(0);
+    }
+    let s = core::str::from_utf8(&trimmed).ok()?;
+    u64::from_str_radix(s, 8).ok()
+}
+
 fuzz_target!(|data: &[u8]| {
-    // Run with permissive limits (should accept anything that isn't structurally broken).
-    run_parser(data, Limits::permissive());
-    // Run with default limits (stricter — may error on oversized paths/pax, but must not panic).
-    run_parser(data, Limits::default());
+    // ~90% of the time (first byte not a multiple of 10), fix up checksums to
+    // reach deeper parser logic; otherwise pass raw bytes to test validation.
+    let should_fixup = !data.is_empty() && data[0] % 10 != 0;
+
+    if should_fixup {
+        let mut data = data.to_vec();
+        fixup_checksums(&mut data);
+        run_parser(&data, Limits::permissive());
+        run_parser(&data, Limits::default());
+    } else {
+        run_parser(data, Limits::permissive());
+        run_parser(data, Limits::default());
+    }
 });
diff --git a/src/parse.rs b/src/parse.rs
@@ -644,6 +644,12 @@ pub struct Parser {
     /// When true, entries with empty paths are allowed through instead of
     /// returning [`ParseError::EmptyPath`].
     allow_empty_path: bool,
+    /// When false, header checksum verification is skipped. This is useful
+    /// for fuzzing, where random input almost never has valid checksums,
+    /// preventing the fuzzer from exercising deeper parser logic.
+    ///
+    /// Default: `true`.
+    verify_checksums: bool,
 }
 
 impl Parser {
@@ -655,6 +661,7 @@ impl Parser {
             state: State::ReadHeader,
             pending: PendingMetadata::default(),
             allow_empty_path: false,
+            verify_checksums: true,
         }
     }
 
@@ -664,6 +671,19 @@ impl Parser {
         self.allow_empty_path = allow;
     }
 
+    /// Control whether header checksums are verified during parsing.
+    ///
+    /// When set to `false`, the parser skips [`Header::verify_checksum`]
+    /// and accepts headers regardless of their checksum field, so corrupt
+    /// headers are no longer rejected at that point. This is primarily
+    /// meant for fuzz testing, where random input almost never carries a
+    /// valid checksum and would otherwise never reach deeper parser code.
+    ///
+    /// Default: `true`.
+    pub fn set_verify_checksums(&mut self, verify: bool) {
+        self.verify_checksums = verify;
+    }
+
     /// Create a new parser with default limits.
     #[must_use]
     pub fn with_defaults() -> Self {
@@ -756,7 +776,9 @@ impl Parser {
 
         // Parse header
         let header = Header::from_bytes(header_bytes);
-        header.verify_checksum()?;
+        if self.verify_checksums {
+            header.verify_checksum()?;
+        }
 
         let entry_type = header.entry_type();
         let size = header.entry_size()?;