Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ cargo-fuzz = true
[dependencies]
arbitrary = { version = "1", features = ["derive"] }
libfuzzer-sys = "0.4"
rand = { version = "0.9", features = ["small_rng"] }
tar = "0.4"

[dependencies.tar-core]
Expand Down
104 changes: 79 additions & 25 deletions fuzz/fuzz_targets/differential.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,23 @@
//! tar-core must never panic. A secondary goal is that whenever tar-rs
//! successfully parses an entry, tar-core should produce a matching entry
//! with equivalent metadata, xattrs, and identical content bytes.
//!
//! ## Behavioral difference allowlist
//!
//! tar-core is intentionally stricter than tar-rs in some areas. When
//! tar-core rejects an archive that tar-rs accepts, we check whether the
//! error falls into a known category of expected divergence before
//! failing the test. Current allowlisted differences:
//!
//! - **Malformed PAX records**: tar-core propagates PAX parse errors
//! (malformed record format, non-UTF-8 keys). tar-rs silently skips
//! malformed PAX records via `.flatten()`.

#![no_main]

use libfuzzer_sys::fuzz_target;
use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry};
use tar_core::parse::{Limits, ParseError};
use tar_core_testutil::{parse_tar_core_detailed, parse_tar_rs, OwnedEntry};

/// Dump the raw 512-byte headers from the (post-fixup) data to stderr.
fn dump_headers(data: &[u8]) {
Expand All @@ -30,33 +42,68 @@ fn dump_headers(data: &[u8]) {
}
}

/// Returns true if the error is a known behavioral difference where
/// tar-core is intentionally stricter than tar-rs.
///
/// When this returns true, tar-rs may have parsed more entries than
/// tar-core, and that's expected.
fn is_allowlisted_divergence(err: &ParseError) -> bool {
    // tar-core rejects malformed PAX records; tar-rs silently skips them.
    let strict_pax = matches!(err, ParseError::Pax(_));
    // Non-UTF-8 PAX keys are likewise rejected by tar-core only.
    let strict_utf8 = matches!(err, ParseError::InvalidUtf8(_));
    strict_pax || strict_utf8
}

/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
///
/// tar-core is intentionally more lenient than tar-rs in some cases (e.g.
/// all-null numeric fields are accepted as 0), so we only require that
/// tar-core parses *at least* as many entries as tar-rs and that those
/// entries match.
fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) {
if tar_core_entries.len() < tar_rs_entries.len() {
eprintln!(
"entry count mismatch: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len()
);
dump_headers(data);
for (i, e) in tar_rs_entries.iter().enumerate() {
eprintln!("tar-rs [{i}]: {e:?}");
}
for (i, e) in tar_core_entries.iter().enumerate() {
eprintln!("tar-core[{i}]: {e:?}");
fn compare_entries(
data: &[u8],
tar_rs_entries: &[OwnedEntry],
tar_core_entries: &[OwnedEntry],
tar_core_error: Option<&ParseError>,
) {
let count_mismatch = tar_core_entries.len() < tar_rs_entries.len();

if count_mismatch {
// Check the behavioral difference allowlist: if tar-core stopped
// due to a known stricter-than-tar-rs error, the entry count
// divergence is expected. We still verify the entries that *were*
// parsed match (fall through to the zip comparison below).
let allowlisted = tar_core_error.is_some_and(|err| is_allowlisted_divergence(err));

if !allowlisted {
eprintln!(
"entry count mismatch: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len()
);
if let Some(err) = tar_core_error {
eprintln!("tar-core error: {err}");
}
dump_headers(data);
for (i, e) in tar_rs_entries.iter().enumerate() {
eprintln!("tar-rs [{i}]: {e:?}");
}
for (i, e) in tar_core_entries.iter().enumerate() {
eprintln!("tar-core[{i}]: {e:?}");
}
panic!(
"tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len(),
);
}
panic!(
"tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len(),
);
}

// Compare entries that both parsers produced. When there's an
// allowlisted count mismatch, zip stops at the shorter list,
// verifying that tar-core's entries are correct up to the point
// where it diverged.
for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() {
if rs != core {
eprintln!("mismatch at entry {i}:");
Expand Down Expand Up @@ -130,6 +177,17 @@ fn parse_octal_simple(bytes: &[u8]) -> Option<u64> {
u64::from_str_radix(s, 8).ok()
}

/// Parse `data` with both tar-rs and tar-core (permissive limits) and
/// assert the results are equivalent, tolerating allowlisted divergences.
fn run_differential(data: &[u8]) {
    let reference_entries = parse_tar_rs(data);
    let detailed = parse_tar_core_detailed(data, Limits::permissive());
    let core_error = detailed.error.as_ref();
    compare_entries(data, &reference_entries, &detailed.entries, core_error);
}

fuzz_target!(|data: &[u8]| {
if data.len() > 256 * 1024 {
return;
Expand All @@ -142,12 +200,8 @@ fuzz_target!(|data: &[u8]| {
if should_fixup {
let mut data = data.to_vec();
fixup_checksums(&mut data);
let tar_rs_entries = parse_tar_rs(&data);
let tar_core_entries = parse_tar_core(&data);
compare_entries(&data, &tar_rs_entries, &tar_core_entries);
run_differential(&data);
} else {
let tar_rs_entries = parse_tar_rs(data);
let tar_core_entries = parse_tar_core(data);
compare_entries(data, &tar_rs_entries, &tar_core_entries);
run_differential(data);
}
});
85 changes: 62 additions & 23 deletions fuzz/fuzz_targets/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,69 @@
//! - Padded size is always >= size and block-aligned (or both zero).
//! - Parsed entry paths are never empty.
//! - Total consumed bytes never exceed the input length.
//!
//! The parser is exercised with variable-length short reads to stress the
//! NeedData/retry path that real callers hit with partial I/O. Each parse
//! call gets a different chunk size from a seeded RNG, simulating realistic
//! non-uniform read patterns.

#![no_main]

use std::mem::size_of;

use libfuzzer_sys::fuzz_target;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use tar_core::parse::{Limits, ParseEvent, Parser};
use tar_core::HEADER_SIZE;

/// Drive a parser to completion over `data`, checking invariants on each entry.
/// Returns normally on errors or NeedData — the point is that it must not panic.
fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) {
/// Max chunk size ceiling for short-read simulation (1 MiB).
///
/// Kept as `u32` so it can be used directly as the upper bound of
/// `rng.random_range(1..=MAX_CHUNK_CEILING)`; call sites cast the drawn
/// value to `usize` when sizing the input window.
const MAX_CHUNK_CEILING: u32 = 1024 * 1024;

/// Drive a parser to completion over `data`, feeding it variable-sized
/// chunks drawn from `rng` to simulate realistic partial reads.
///
/// On NeedData, the exposed window grows to provide the requested minimum.
/// After each successfully processed event, a fresh chunk size is drawn
/// from the RNG so the parser sees different split points throughout.
///
/// Checks invariants on each entry and returns normally on errors or
/// NeedData — the point is that it must not panic.
fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool, rng: &mut SmallRng) {
let mut parser = Parser::new(limits);
parser.set_verify_checksums(verify_checksums);
let mut offset: usize = 0;
let mut window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize;

loop {
assert!(offset <= data.len(), "offset exceeded input length");
let input = &data[offset..];
let remaining = data.len() - offset;
if remaining == 0 {
break;
}

let input = &data[offset..offset + remaining.min(window)];

match parser.parse(input) {
Ok(ParseEvent::NeedData { .. }) => break,
Ok(ParseEvent::NeedData { min_bytes }) => {
if remaining < min_bytes {
break;
}
// Widen the window to satisfy the parser's request and retry.
window = min_bytes;
continue;
}

Ok(ParseEvent::Entry { consumed, entry })
| Ok(ParseEvent::SparseEntry {
consumed, entry, ..
}) => {
// consumed bytes must not exceed remaining input
assert!(
consumed <= input.len(),
"consumed {consumed} > remaining {}",
"consumed {consumed} > input len {}",
input.len()
);

// Padded-size invariants
assert!(
entry.padded_size() >= entry.size,
"padded_size {} < size {}",
Expand All @@ -55,39 +85,39 @@ fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) {
);
}

// Path must not be empty
assert!(!entry.path.is_empty(), "entry path is empty");

offset += consumed;

// Skip content + padding; if not enough data remains, bail out.
let padded = entry.padded_size() as usize;
if offset.saturating_add(padded) > data.len() {
break;
}
offset += padded;

window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize;
}

Ok(ParseEvent::GlobalExtensions { consumed, .. }) => {
assert!(
consumed <= input.len(),
"GlobalExtensions consumed {consumed} > remaining {}",
"GlobalExtensions consumed {consumed} > input len {}",
input.len()
);
offset += consumed;
window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize;
}

Ok(ParseEvent::End { consumed }) => {
assert!(
consumed <= input.len(),
"End consumed {consumed} > remaining {}",
"End consumed {consumed} > input len {}",
input.len()
);
offset += consumed;
break;
}

// Parse errors are expected on fuzzed input — just stop.
Err(_) => break,
}
}
Expand All @@ -99,15 +129,24 @@ fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) {
);
}

/// Byte offset where the tar payload begins (after the config/seed header).
///
/// Header layout of each fuzz input: byte 0 selects checksum behavior,
/// bytes 1..9 are a little-endian `u64` seed for the chunk-size RNG.
const PAYLOAD_OFFSET: usize = 1 + size_of::<u64>();

fuzz_target!(|data: &[u8]| {
    // Inputs shorter than the config/seed header carry no payload at all.
    if data.len() < PAYLOAD_OFFSET {
        return;
    }

    // Byte 0: checksum behavior. Verify on multiples of 10 (~10% of
    // inputs); skip otherwise (~90%) so the fuzzer can reach deeper
    // parser logic despite random inputs rarely having valid checksums.
    let verify_checksums = data[0] % 10 == 0;

    // Bytes 1..9: little-endian seed for the chunk-size RNG.
    let seed_bytes: [u8; 8] = data[1..PAYLOAD_OFFSET].try_into().unwrap();
    let mut rng = SmallRng::seed_from_u64(u64::from_le_bytes(seed_bytes));

    let payload = &data[PAYLOAD_OFFSET..];

    // Exercise both limit profiles; the RNG state carries across so the
    // two runs see different chunk-size sequences.
    for limits in [Limits::permissive(), Limits::default()] {
        run_parser(payload, limits, verify_checksums, &mut rng);
    }
});
Loading