diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index b833407..260144d 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -13,6 +13,7 @@ cargo-fuzz = true [dependencies] arbitrary = { version = "1", features = ["derive"] } libfuzzer-sys = "0.4" +rand = { version = "0.9", features = ["small_rng"] } tar = "0.4" [dependencies.tar-core] diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index e6a83b1..330c3a6 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -5,11 +5,23 @@ //! tar-core must never panic. A secondary goal is that whenever tar-rs //! successfully parses an entry, tar-core should produce a matching entry //! with equivalent metadata, xattrs, and identical content bytes. +//! +//! ## Behavioral difference allowlist +//! +//! tar-core is intentionally stricter than tar-rs in some areas. When +//! tar-core rejects an archive that tar-rs accepts, we check whether the +//! error falls into a known category of expected divergence before +//! failing the test. Current allowlisted differences: +//! +//! - **Malformed PAX records**: tar-core propagates PAX parse errors +//! (malformed record format, non-UTF-8 keys). tar-rs silently skips +//! malformed PAX records via `.flatten()`. #![no_main] use libfuzzer_sys::fuzz_target; -use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry}; +use tar_core::parse::{Limits, ParseError}; +use tar_core_testutil::{parse_tar_core_detailed, parse_tar_rs, OwnedEntry}; /// Dump the raw 512-byte headers from the (post-fixup) data to stderr. fn dump_headers(data: &[u8]) { @@ -30,33 +42,68 @@ fn dump_headers(data: &[u8]) { } } +/// Returns true if the error is a known behavioral difference where +/// tar-core is intentionally stricter than tar-rs. +/// +/// When this returns true, tar-rs may have parsed more entries than +/// tar-core, and that's expected. 
+fn is_allowlisted_divergence(err: &ParseError) -> bool { + matches!( + err, + // tar-core rejects malformed PAX records; tar-rs silently skips them. + ParseError::Pax(_) | ParseError::InvalidUtf8(_) + ) +} + /// Compare entries parsed by tar-rs and tar-core, asserting equivalence. /// /// tar-core is intentionally more lenient than tar-rs in some cases (e.g. /// all-null numeric fields are accepted as 0), so we only require that /// tar-core parses *at least* as many entries as tar-rs and that those /// entries match. -fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { - if tar_core_entries.len() < tar_rs_entries.len() { - eprintln!( - "entry count mismatch: tar-core={} tar-rs={}", - tar_core_entries.len(), - tar_rs_entries.len() - ); - dump_headers(data); - for (i, e) in tar_rs_entries.iter().enumerate() { - eprintln!("tar-rs [{i}]: {e:?}"); - } - for (i, e) in tar_core_entries.iter().enumerate() { - eprintln!("tar-core[{i}]: {e:?}"); +fn compare_entries( + data: &[u8], + tar_rs_entries: &[OwnedEntry], + tar_core_entries: &[OwnedEntry], + tar_core_error: Option<&ParseError>, +) { + let count_mismatch = tar_core_entries.len() < tar_rs_entries.len(); + + if count_mismatch { + // Check the behavioral difference allowlist: if tar-core stopped + // due to a known stricter-than-tar-rs error, the entry count + // divergence is expected. We still verify the entries that *were* + // parsed match (fall through to the zip comparison below). 
+ let allowlisted = tar_core_error.is_some_and(|err| is_allowlisted_divergence(err)); + + if !allowlisted { + eprintln!( + "entry count mismatch: tar-core={} tar-rs={}", + tar_core_entries.len(), + tar_rs_entries.len() + ); + if let Some(err) = tar_core_error { + eprintln!("tar-core error: {err}"); + } + dump_headers(data); + for (i, e) in tar_rs_entries.iter().enumerate() { + eprintln!("tar-rs [{i}]: {e:?}"); + } + for (i, e) in tar_core_entries.iter().enumerate() { + eprintln!("tar-core[{i}]: {e:?}"); + } + panic!( + "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", + tar_core_entries.len(), + tar_rs_entries.len(), + ); } - panic!( - "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", - tar_core_entries.len(), - tar_rs_entries.len(), - ); } + // Compare entries that both parsers produced. When there's an + // allowlisted count mismatch, zip stops at the shorter list, + // verifying that tar-core's entries are correct up to the point + // where it diverged. 
for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() { if rs != core { eprintln!("mismatch at entry {i}:"); @@ -130,6 +177,17 @@ fn parse_octal_simple(bytes: &[u8]) -> Option<u64> { u64::from_str_radix(s, 8).ok() } +fn run_differential(data: &[u8]) { + let tar_rs_entries = parse_tar_rs(data); + let result = parse_tar_core_detailed(data, Limits::permissive()); + compare_entries( + data, + &tar_rs_entries, + &result.entries, + result.error.as_ref(), + ); +} + fuzz_target!(|data: &[u8]| { if data.len() > 256 * 1024 { return; @@ -142,12 +200,8 @@ fuzz_target!(|data: &[u8]| { if should_fixup { let mut data = data.to_vec(); fixup_checksums(&mut data); - let tar_rs_entries = parse_tar_rs(&data); - let tar_core_entries = parse_tar_core(&data); - compare_entries(&data, &tar_rs_entries, &tar_core_entries); + run_differential(&data); } else { - let tar_rs_entries = parse_tar_rs(data); - let tar_core_entries = parse_tar_core(data); - compare_entries(data, &tar_rs_entries, &tar_core_entries); + run_differential(data); } }); diff --git a/fuzz/fuzz_targets/parse.rs b/fuzz/fuzz_targets/parse.rs index 0a47457..30f5e1e 100644 --- a/fuzz/fuzz_targets/parse.rs +++ b/fuzz/fuzz_targets/parse.rs @@ -5,39 +5,69 @@ //! - Padded size is always >= size and block-aligned (or both zero). //! - Parsed entry paths are never empty. //! - Total consumed bytes never exceed the input length. +//! +//! The parser is exercised with variable-length short reads to stress the +//! NeedData/retry path that real callers hit with partial I/O. Each parse +//! call gets a different chunk size from a seeded RNG, simulating realistic +//! non-uniform read patterns. #![no_main] +use std::mem::size_of; + use libfuzzer_sys::fuzz_target; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use tar_core::parse::{Limits, ParseEvent, Parser}; use tar_core::HEADER_SIZE; -/// Drive a parser to completion over `data`, checking invariants on each entry.
-/// Returns normally on errors or NeedData — the point is that it must not panic. -fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) { +/// Max chunk size ceiling for short-read simulation (1 MiB). +const MAX_CHUNK_CEILING: u32 = 1024 * 1024; + +/// Drive a parser to completion over `data`, feeding it variable-sized +/// chunks drawn from `rng` to simulate realistic partial reads. +/// +/// On NeedData, the exposed window grows to provide the requested minimum. +/// After each successfully processed event, a fresh chunk size is drawn +/// from the RNG so the parser sees different split points throughout. +/// +/// Checks invariants on each entry and returns normally on errors or +/// NeedData — the point is that it must not panic. +fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool, rng: &mut SmallRng) { let mut parser = Parser::new(limits); parser.set_verify_checksums(verify_checksums); let mut offset: usize = 0; + let mut window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize; loop { assert!(offset <= data.len(), "offset exceeded input length"); - let input = &data[offset..]; + let remaining = data.len() - offset; + if remaining == 0 { + break; + } + + let input = &data[offset..offset + remaining.min(window)]; match parser.parse(input) { - Ok(ParseEvent::NeedData { .. }) => break, + Ok(ParseEvent::NeedData { min_bytes }) => { + if remaining < min_bytes { + break; + } + // Widen the window to satisfy the parser's request and retry. + window = min_bytes; + continue; + } Ok(ParseEvent::Entry { consumed, entry }) | Ok(ParseEvent::SparseEntry { consumed, entry, .. 
}) => { - // consumed bytes must not exceed remaining input assert!( consumed <= input.len(), - "consumed {consumed} > remaining {}", + "consumed {consumed} > input len {}", input.len() ); - // Padded-size invariants assert!( entry.padded_size() >= entry.size, "padded_size {} < size {}", @@ -55,39 +85,39 @@ fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) { ); } - // Path must not be empty assert!(!entry.path.is_empty(), "entry path is empty"); offset += consumed; - // Skip content + padding; if not enough data remains, bail out. let padded = entry.padded_size() as usize; if offset.saturating_add(padded) > data.len() { break; } offset += padded; + + window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize; } Ok(ParseEvent::GlobalExtensions { consumed, .. }) => { assert!( consumed <= input.len(), - "GlobalExtensions consumed {consumed} > remaining {}", + "GlobalExtensions consumed {consumed} > input len {}", input.len() ); offset += consumed; + window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize; } Ok(ParseEvent::End { consumed }) => { assert!( consumed <= input.len(), - "End consumed {consumed} > remaining {}", + "End consumed {consumed} > input len {}", input.len() ); offset += consumed; break; } - // Parse errors are expected on fuzzed input — just stop. Err(_) => break, } } @@ -99,15 +129,24 @@ fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) { ); } +/// Byte offset where the tar payload begins (after the config/seed header). +const PAYLOAD_OFFSET: usize = 1 + size_of::<u64>(); + fuzz_target!(|data: &[u8]| { - // 90% of the time, skip checksum verification to exercise deeper parser - // logic (PAX extensions, GNU long name/link, sparse files, field parsing, - // etc.). Random fuzz input almost never has valid checksums, so without - // this the fuzzer would break immediately on every input. - // - // 10% of the time, verify checksums normally to test that code path too.
- let skip_checksums = !data.is_empty() && data[0] % 10 != 0; - - run_parser(data, Limits::permissive(), !skip_checksums); - run_parser(data, Limits::default(), !skip_checksums); + if data.len() < PAYLOAD_OFFSET { + return; + } + + // First byte: checksum behavior. + // 90% skip checksums, 10% verify them. + let skip_checksums = data[0] % 10 != 0; + + // Bytes 1..9: seed for the chunk-size RNG. + let seed = u64::from_le_bytes(data[1..PAYLOAD_OFFSET].try_into().unwrap()); + let mut rng = SmallRng::seed_from_u64(seed); + + let payload = &data[PAYLOAD_OFFSET..]; + + run_parser(payload, Limits::permissive(), !skip_checksums, &mut rng); + run_parser(payload, Limits::default(), !skip_checksums, &mut rng); }); diff --git a/fuzz/fuzz_targets/roundtrip.rs b/fuzz/fuzz_targets/roundtrip.rs index c736b56..2a178fb 100644 --- a/fuzz/fuzz_targets/roundtrip.rs +++ b/fuzz/fuzz_targets/roundtrip.rs @@ -2,12 +2,15 @@ //! verify roundtrip equivalence. //! //! The invariant: if EntryBuilder successfully produces an archive, Parser -//! must parse it back to identical metadata and content. +//! must parse it back to identical metadata and content — even when the +//! parser only receives variable-length short reads from a seeded RNG. #![no_main] use arbitrary::{Arbitrary, Unstructured}; use libfuzzer_sys::fuzz_target; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use tar_core::builder::EntryBuilder; use tar_core::parse::{Limits, ParseEvent, Parser}; use tar_core::{EntryType, SparseEntry, HEADER_SIZE}; @@ -30,6 +33,8 @@ struct FuzzEntry { sparse_entries: Vec<(u16, u16)>, /// When true and entry_kind selects Regular, build a sparse entry use_sparse: bool, + /// Seed for the short-read chunk size RNG. + rng_seed: u64, } /// Strip NUL bytes, ensure non-empty, clamp length. 
@@ -192,17 +197,12 @@ fuzz_target!(|data: &[u8]| { // End-of-archive marker archive.extend(std::iter::repeat_n(0u8, HEADER_SIZE * 2)); - // Parse it back + // Parse it back with variable-length short reads to exercise NeedData. + let mut rng = SmallRng::seed_from_u64(entry.rng_seed); let mut parser = Parser::new(Limits::default()); let mut offset = 0; - let input = &archive[offset..]; - let event = match parser.parse(input) { - Ok(e) => e, - other => { - panic!("expected successful parse from archive we just built, got: {other:?}"); - } - }; + let event = parse_short_read(&mut parser, &archive, &mut offset, &mut rng); // Extract parsed entry and verify sparse-specific fields let parsed_entry = match event { @@ -294,10 +294,48 @@ fuzz_target!(|data: &[u8]| { let padded = content.len().next_multiple_of(HEADER_SIZE); offset += padded; - match parser.parse(&archive[offset..]) { - Ok(ParseEvent::End { .. }) => {} + let end_event = parse_short_read(&mut parser, &archive, &mut offset, &mut rng); + match end_event { + ParseEvent::End { .. } => {} other => { panic!("expected End after single entry, got: {other:?}"); } } }); + +/// Max chunk size ceiling for short-read simulation (1 MiB). +const MAX_CHUNK_CEILING: u32 = 1024 * 1024; + +/// Parse the next event from `archive[*offset..]` using variable-length +/// short reads drawn from `rng`, growing the window when the parser +/// requests more via NeedData. +/// +/// Panics if parsing fails or if the archive is truncated (since we +/// built it ourselves, it must always be complete). 
+fn parse_short_read<'a>( + parser: &mut Parser, + archive: &'a [u8], + offset: &mut usize, + rng: &mut SmallRng, +) -> ParseEvent<'a> { + let mut window = rng.random_range(1..=MAX_CHUNK_CEILING) as usize; + loop { + let remaining = archive.len() - *offset; + assert!(remaining > 0, "unexpected end of archive we just built"); + let input = &archive[*offset..*offset + remaining.min(window)]; + match parser.parse(input) { + Ok(ParseEvent::NeedData { min_bytes }) => { + assert!( + remaining >= min_bytes, + "archive we built is truncated: need {min_bytes}, have {remaining}" + ); + window = min_bytes; + continue; + } + Ok(event) => return event, + Err(e) => { + panic!("expected successful parse from archive we just built, got error: {e}"); + } + } + } +} diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index 69def71..c5b2c22 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -6,7 +6,7 @@ use std::io::{Cursor, Read}; -use tar_core::parse::{Limits, ParseEvent, Parser}; +use tar_core::parse::{Limits, ParseError, ParseEvent, Parser}; use tar_core::{HEADER_SIZE, PAX_SCHILY_XATTR}; /// Owned snapshot of a parsed tar entry, including content bytes. @@ -39,19 +39,34 @@ pub struct OwnedEntry { /// Maximum content size read per entry (prevents OOM on fuzzed inputs). const MAX_CONTENT_READ: u64 = 256 * 1024; +/// Result of parsing with tar-core: entries collected so far plus an +/// optional terminal error (if parsing stopped due to an error rather +/// than reaching end-of-archive or running out of data). +pub struct TarCoreParseResult { + pub entries: Vec<OwnedEntry>, + pub error: Option<ParseError>, +} + /// Parse a tar archive with tar-core's sans-IO parser using permissive limits. pub fn parse_tar_core(data: &[u8]) -> Vec<OwnedEntry> { - parse_tar_core_with_limits(data, Limits::permissive()) + parse_tar_core_detailed(data, Limits::permissive()).entries } /// Parse a tar archive with tar-core using the given limits.
pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec<OwnedEntry> { + parse_tar_core_detailed(data, limits).entries +} + +/// Parse a tar archive with tar-core, returning both entries and +/// any terminal parse error. +pub fn parse_tar_core_detailed(data: &[u8], limits: Limits) -> TarCoreParseResult { let mut results = Vec::new(); let mut parser = Parser::new(limits); // Allow entries with empty paths so we don't stop parsing early. // We skip them below to match parse_tar_rs's behaviour. parser.set_allow_empty_path(true); let mut offset = 0; + let mut terminal_error = None; loop { if offset > data.len() { @@ -143,11 +158,17 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec<OwnedEntry> { Ok(ParseEvent::End { .. }) => break, - Err(_) => break, + Err(e) => { + terminal_error = Some(e); + break; + } } } - results + TarCoreParseResult { + entries: results, + error: terminal_error, + } } /// Parse a tar archive with the `tar` crate, returning owned entries.