fix(lexarg): Use official encoded_bytes API

epage · epage · commit 23d54d9936b4 · 2024-02-19T15:43:13.000-06:00
diff --git a/crates/lexarg/src/ext.rs b/crates/lexarg/src/ext.rs
@@ -2,6 +2,8 @@ use std::ffi::OsStr;
 
 pub(crate) trait OsStrExt: private::Sealed {
     /// Converts to a string slice.
+    /// On failure, the returned `Utf8Error`'s `valid_up_to()` is guaranteed
+    /// to be a valid UTF-8 boundary.
     fn try_str(&self) -> Result<&str, std::str::Utf8Error>;
     /// Returns `true` if the given pattern matches a sub-slice of
     /// this string slice.
@@ -35,7 +37,7 @@ pub(crate) trait OsStrExt: private::Sealed {
 
 impl OsStrExt for OsStr {
     fn try_str(&self) -> Result<&str, std::str::Utf8Error> {
-        let bytes = to_bytes(self);
+        let bytes = self.as_encoded_bytes();
         std::str::from_utf8(bytes)
     }
 
@@ -44,22 +46,22 @@ impl OsStrExt for OsStr {
     }
 
     fn find(&self, needle: &str) -> Option<usize> {
-        let bytes = to_bytes(self);
+        let bytes = self.as_encoded_bytes();
         (0..=self.len().checked_sub(needle.len())?)
             .find(|&x| bytes[x..].starts_with(needle.as_bytes()))
     }
 
     fn strip_prefix(&self, prefix: &str) -> Option<&OsStr> {
-        let bytes = to_bytes(self);
+        let bytes = self.as_encoded_bytes();
         bytes.strip_prefix(prefix.as_bytes()).map(|s| {
             // SAFETY:
-            // - This came from `to_bytes`
-            // - Since `prefix` is `&str`, any split will be along UTF-8 boundarie
-            unsafe { to_os_str_unchecked(s) }
+            // - This came from `as_encoded_bytes`
+            // - Since `prefix` is `&str`, any split will be along a UTF-8 boundary
+            unsafe { OsStr::from_encoded_bytes_unchecked(s) }
         })
     }
     fn starts_with(&self, prefix: &str) -> bool {
-        let bytes = to_bytes(self);
+        let bytes = self.as_encoded_bytes();
         bytes.starts_with(prefix.as_bytes())
     }
 
@@ -74,13 +76,18 @@ impl OsStrExt for OsStr {
     fn split_once(&self, needle: &'_ str) -> Option<(&OsStr, &OsStr)> {
         let start = self.find(needle)?;
         let end = start + needle.len();
-        let haystack = to_bytes(self);
+        let haystack = self.as_encoded_bytes();
         let first = &haystack[0..start];
         let second = &haystack[end..];
         // SAFETY:
-        // - This came from `to_bytes`
-        // - Since `needle` is `&str`, any split will be along UTF-8 boundarie
-        unsafe { Some((to_os_str_unchecked(first), to_os_str_unchecked(second))) }
+        // - This came from `as_encoded_bytes`
+        // - Since `needle` is `&str`, any split will be along a UTF-8 boundary
+        unsafe {
+            Some((
+                OsStr::from_encoded_bytes_unchecked(first),
+                OsStr::from_encoded_bytes_unchecked(second),
+            ))
+        }
     }
 }
 
@@ -90,45 +97,6 @@ mod private {
     impl Sealed for std::ffi::OsStr {}
 }
 
-/// Allow access to raw bytes
-///
-/// As the non-UTF8 encoding is not defined, the bytes only make sense when compared with
-/// 7-bit ASCII or `&str`
-///
-/// # Compatibility
-///
-/// There is no guarantee how non-UTF8 bytes will be encoded, even within versions of this crate
-/// (since its dependent on rustc)
-fn to_bytes(s: &OsStr) -> &[u8] {
-    // SAFETY:
-    // - Lifetimes are the same
-    // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
-    // - The primary contract is that the encoding for invalid surrogate code points is not
-    //   guaranteed which isn't a problem here
-    //
-    // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
-    // but its in limbo
-    unsafe { std::mem::transmute(s) }
-}
-
-/// Restore raw bytes as `OsStr`
-///
-/// # Safety
-///
-/// - `&[u8]` must either by a `&str` or originated with `to_bytes` within the same binary
-/// - Any splits of the original `&[u8]` must be done along UTF-8 boundaries
-unsafe fn to_os_str_unchecked(s: &[u8]) -> &OsStr {
-    // SAFETY:
-    // - Lifetimes are the same
-    // - Types are compatible (`OsStr` is effectively a transparent wrapper for `[u8]`)
-    // - The primary contract is that the encoding for invalid surrogate code points is not
-    //   guaranteed which isn't a problem here
-    //
-    // There is a proposal to support this natively (https://github.com/rust-lang/rust/pull/95290)
-    // but its in limbo
-    std::mem::transmute(s)
-}
-
 pub struct Split<'s, 'n> {
     haystack: Option<&'s OsStr>,
     needle: &'n str,
@@ -161,7 +129,10 @@ impl<'s, 'n> Iterator for Split<'s, 'n> {
 ///
 /// `index` must be at a valid UTF-8 boundary
 pub(crate) unsafe fn split_at(os: &OsStr, index: usize) -> (&OsStr, &OsStr) {
-    let bytes = to_bytes(os);
+    let bytes = os.as_encoded_bytes();
     let (first, second) = bytes.split_at(index);
-    (to_os_str_unchecked(first), to_os_str_unchecked(second))
+    (
+        OsStr::from_encoded_bytes_unchecked(first),
+        OsStr::from_encoded_bytes_unchecked(second),
+    )
 }