@@ -2004,6 +2004,93 @@ mod tests {
20042004 }
20052005 }
20062006
#[test]
// The `from_utf8` call below is deliberately fed bytes that are not valid
// UTF-8, so the lint that flags such calls is silenced for this test.
#[allow(invalid_from_utf8)]
fn test_parser_pax_non_utf8_path() {
    // A PAX `path` value containing non-UTF-8 bytes must be accepted and
    // handed back unchanged. POSIX says PAX path values SHOULD be UTF-8,
    // but real-world archives (e.g. from Docker/BuildKit) may not comply;
    // Go's archive/tar and the Rust tar crate are similarly pragmatic.
    // See bootc-dev/bootc#2073 for a concrete example.
    let raw_path: &[u8] = b"etc/ssl/certs/F\xf5 tan\xfa s\xed tv\xe1 ny.pem";
    assert!(
        core::str::from_utf8(raw_path).is_err(),
        "test data must be non-UTF-8"
    );

    // Layout: PAX extended header carrying the path, the regular header it
    // applies to (with a placeholder name), then trailing zero bytes
    // (presumably the end-of-archive marker).
    let mut archive = Vec::new();
    archive.extend(make_pax_header(&[("path", raw_path)]));
    archive.extend_from_slice(&make_header(b"placeholder.pem", 0, b'0'));
    archive.extend(zeroes(1024));

    let mut parser = Parser::new(Limits::default());
    let event = parser.parse(&archive).unwrap();

    // The first event must be the entry itself.
    let entry = match event {
        ParseEvent::Entry { entry, .. } => entry,
        other => panic!("Expected Entry, got {:?}", other),
    };

    // The raw path bytes come from the PAX record, not the header name.
    assert_eq!(entry.path.as_ref(), raw_path);

    // The lossy accessor is expected to substitute U+FFFD for the bytes
    // that cannot be decoded.
    assert!(
        entry.path_lossy().contains('\u{FFFD}'),
        "lossy conversion should have replacement chars"
    );
}
2042+
#[test]
fn test_pax_non_utf8_path_roundtrip() {
    // Builder -> parser roundtrip for a path that is both longer than 100
    // bytes (so that PAX emission is triggered) and not valid UTF-8. The
    // parser must return exactly the bytes given to the builder.
    use crate::builder::EntryBuilder;

    // Two pieces joined together: the first ends with the non-UTF-8 bytes,
    // the second pads the total length past the 100-byte threshold.
    let long_path: Vec<u8> = [
        &b"a/very/deep/directory/structure/that/needs/to/exceed/one/hundred/bytes/\xf0 \xf1 \xf2 /"[..],
        &b"and/some/more/nested/dirs/to/be/safe/file.bin"[..],
    ]
    .concat();
    assert!(long_path.len() > 100, "path must exceed 100 bytes");
    assert!(
        core::str::from_utf8(&long_path).is_err(),
        "path must contain non-UTF-8"
    );

    let mut builder = EntryBuilder::new_ustar();
    builder
        .path(&long_path)
        .mode(0o644)
        .unwrap()
        .size(5)
        .unwrap()
        .mtime(0)
        .unwrap()
        .uid(0)
        .unwrap()
        .gid(0)
        .unwrap();

    let mut archive = Vec::new();
    archive.extend_from_slice(&builder.finish_bytes());
    // Five payload bytes followed by zero padding up to one 512-byte block.
    archive.extend_from_slice(b"hello");
    archive.extend_from_slice(&[0u8; 512 - 5]);
    archive.extend(zeroes(1024));

    let mut parser = Parser::new(Limits::default());
    let event = parser.parse(&archive).unwrap();

    let entry = match event {
        ParseEvent::Entry { entry, .. } => entry,
        other => panic!("Expected Entry, got {:?}", other),
    };

    // Path bytes and declared size must both survive the roundtrip.
    assert_eq!(entry.path.as_ref(), long_path.as_slice());
    assert_eq!(entry.size, 5);
}
2093+
20072094 #[ test]
20082095 fn test_parser_pax_size_override ( ) {
20092096 // PAX header should override the size in the actual header
@@ -2186,6 +2273,29 @@ mod tests {
21862273 }
21872274 }
21882275
#[test]
// `from_utf8` is intentionally called on bytes known to be invalid UTF-8,
// so the lint that flags such calls is silenced for this test.
#[allow(invalid_from_utf8)]
fn test_parser_pax_non_utf8_linkpath() {
    // A PAX `linkpath` value with non-UTF-8 bytes must reach the caller
    // byte-for-byte as the entry's link target.
    let raw_target: &[u8] = b"targets/\xc0 \xc1 invalid.so";
    assert!(core::str::from_utf8(raw_target).is_err());

    // PAX header with the link target, then a header of type `2` (symlink),
    // then trailing zero bytes.
    let mut archive = Vec::new();
    archive.extend(make_pax_header(&[("linkpath", raw_target)]));
    archive.extend_from_slice(&make_header(b"link.so", 0, b'2'));
    archive.extend(zeroes(1024));

    let mut parser = Parser::new(Limits::default());
    let event = parser.parse(&archive).unwrap();

    let entry = match event {
        ParseEvent::Entry { entry, .. } => entry,
        other => panic!("Expected Entry, got {:?}", other),
    };

    assert_eq!(entry.link_target.as_deref(), Some(raw_target.as_ref()));
}
2298+
21892299 // =========================================================================
21902300 // PAX global header tests
21912301 // =========================================================================
@@ -2453,6 +2563,79 @@ mod tests {
24532563 }
24542564 }
24552565
#[test]
fn test_parser_pax_before_gnu_long_name() {
    // PAX 'x' -> GNU 'L' -> real entry: this is what tar-rs's builder
    // produces when you call append_pax_extensions() (e.g. for xattrs)
    // followed by append_data() with a long path. The PAX metadata
    // should still be associated with the real entry, and PAX path
    // (if present) should take precedence over the GNU long name.
    //
    // This ordering matters for ecosystem compatibility with bootc
    // (see bootc-dev/bootc#2073).
    let gnu_name =
        "gnu/long/name/that/exceeds/one/hundred/bytes/".to_string() + &"g".repeat(60);
    let xattr_value = b"some xattr value";

    let mut archive = Vec::new();
    // PAX header first (with xattr but no path -- simulating bootc's
    // copy_entry which strips path/linkpath from PAX)
    archive.extend(make_pax_header(&[(
        "SCHILY.xattr.user.test",
        xattr_value.as_slice(),
    )]));
    // GNU long name second
    archive.extend(make_gnu_long_name(gnu_name.as_bytes()));
    // Real entry last
    archive.extend_from_slice(&make_header(b"placeholder", 0, b'0'));
    archive.extend(zeroes(1024));

    let mut parser = Parser::new(Limits::default());
    let event = parser.parse(&archive).unwrap();

    match event {
        ParseEvent::Entry { entry, .. } => {
            // GNU long name should be used (no PAX path to override it)
            assert_eq!(entry.path.as_ref(), gnu_name.as_bytes());

            // The PAX data must still be attached to the entry even though
            // a GNU long-name header sat between it and the real header.
            // `expect` checks presence and extracts in one step, so the
            // failure message is reported at the point of use.
            let pax_data = entry
                .pax
                .expect("PAX data should be attached to the entry");
            let pax = PaxExtensions::new(pax_data);

            // Records that fail to parse are skipped; only the first record
            // whose key starts with the xattr prefix is of interest.
            let xattr = pax
                .filter_map(|e| e.ok())
                .find(|e| e.key_bytes().starts_with(b"SCHILY.xattr."))
                .expect("xattr should be preserved");
            assert_eq!(xattr.value_bytes(), xattr_value);
        }
        other => panic!("Expected Entry, got {:?}", other),
    }
}
2612+
#[test]
fn test_parser_pax_path_overrides_gnu_long_name_reversed_order() {
    // Counterpart of test_parser_combined_gnu_pax with the two extension
    // headers swapped: PAX 'x' (carrying a path) comes first, GNU 'L'
    // second, the real entry last. The PAX path must win regardless of
    // which extension header appears first.
    let pax_path = "pax/should/still/win/file.txt";
    let gnu_name = format!("gnu/long/name/{}", "g".repeat(100));

    let mut archive = Vec::new();
    // PAX ahead of the GNU long name (the reverse of
    // test_parser_combined_gnu_pax).
    archive.extend(make_pax_header(&[("path", pax_path.as_bytes())]));
    archive.extend(make_gnu_long_name(gnu_name.as_bytes()));
    archive.extend_from_slice(&make_header(b"header.txt", 0, b'0'));
    archive.extend(zeroes(1024));

    let mut parser = Parser::new(Limits::default());
    let event = parser.parse(&archive).unwrap();

    let entry = match event {
        ParseEvent::Entry { entry, .. } => entry,
        other => panic!("Expected Entry, got {:?}", other),
    };

    assert_eq!(entry.path.as_ref(), pax_path.as_bytes());
}
2638+
24562639 #[ test]
24572640 fn test_parser_gnu_long_name_and_link_combined ( ) {
24582641 // Both GNU long name and long link for the same entry
0 commit comments