From 2b80b80cac4ea4661504c8783d972f46e01ae50f Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 24 Mar 2022 14:23:16 -0700 Subject: [PATCH 1/2] Document that `OsString` and `OsStr` are bytes; provide conversions to bytes This commit consists almost entirely of documentation, updating the module documentation in `std::ffi` and the type documentation on `OsString`. This only adds two methods: `OsStr::as_bytes` and `OsString::into_vec`, and plumbs those methods down to the existing implementations for both UNIX and Windows. --- .../src/debuginfo/line_info.rs | 2 +- compiler/rustc_fs_util/src/lib.rs | 2 +- library/std/src/ffi/mod.rs | 30 +++++--- library/std/src/ffi/os_str.rs | 76 +++++++++++++++---- library/std/src/os/unix/ffi/mod.rs | 8 -- library/std/src/sys/unix/os_str.rs | 10 +++ library/std/src/sys/windows/os_str.rs | 10 +++ library/std/src/sys_common/wtf8.rs | 13 ++-- src/test/ui/env-funky-keys.rs | 2 +- 9 files changed, 112 insertions(+), 41 deletions(-) diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs index 476d6a54e1256..cc558352bcf3b 100644 --- a/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs +++ b/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs @@ -39,7 +39,7 @@ fn osstr_as_utf8_bytes(path: &OsStr) -> &[u8] { #[cfg(unix)] { use std::os::unix::ffi::OsStrExt; - path.as_bytes() + OsStrExt::as_bytes(path) } #[cfg(not(unix))] { diff --git a/compiler/rustc_fs_util/src/lib.rs b/compiler/rustc_fs_util/src/lib.rs index 87e97c746ef56..abc188059680d 100644 --- a/compiler/rustc_fs_util/src/lib.rs +++ b/compiler/rustc_fs_util/src/lib.rs @@ -82,7 +82,7 @@ pub fn path_to_c_string(p: &Path) -> CString { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; let p: &OsStr = p.as_ref(); - CString::new(p.as_bytes()).unwrap() + CString::new(OsStrExt::as_bytes(p)).unwrap() } #[cfg(windows)] pub fn path_to_c_string(p: &Path) -> CString { diff --git 
a/library/std/src/ffi/mod.rs b/library/std/src/ffi/mod.rs index 13e3dacc30d63..d3971334462c0 100644 --- a/library/std/src/ffi/mod.rs +++ b/library/std/src/ffi/mod.rs @@ -90,7 +90,8 @@ //! exists you will get a [Some]\(os_string), which you can //! *then* try to convert to a Rust string. This yields a [`Result`], so that //! your code can detect errors in case the environment variable did -//! not in fact contain valid Unicode data. +//! not in fact contain valid Unicode data. You can also process the `OsString` directly, such as +//! by using it as a filename. //! //! * [`OsStr`] losslessly represents a borrowed reference to a platform string. //! However, this representation is not necessarily in a form native to the platform. @@ -99,17 +100,28 @@ //! //! # Conversions //! +//! ## On all platforms +//! +//! On all platforms, `OsStr` and `OsString` consist of a sequence of bytes; see [`OsString`] for +//! more details on its encoding on different platforms. +//! +//! `OsStr` provides the method [`OsStr::as_bytes`], which provides a zero-cost conversion to a +//! byte slice. (`OsString` provides this method as well, along with all other `OsStr` methods, via +//! `Deref`.) +//! +//! `OsString` provides the method [`OsString::into_vec`], which provides a zero-cost conversion to +//! `Vec<u8>`. +//! //! ## On Unix //! //! On Unix, [`OsStr`] implements the //! std::os::unix::ffi::[OsStrExt][unix.OsStrExt] trait, which -//! augments it with two methods, [`from_bytes`] and [`as_bytes`]. -//! These do inexpensive conversions from and to UTF-8 byte slices. +//! augments it with an additional method [`from_bytes`], providing a zero-cost conversion from a +//! byte slice. //! //! Additionally, on Unix [`OsString`] implements the -//! std::os::unix::ffi::[OsStringExt][unix.OsStringExt] trait, -//! which provides [`from_vec`] and [`into_vec`] methods that consume -//! their arguments, and take or produce vectors of [`u8`]. +//! 
std::os::unix::ffi::[OsStringExt][unix.OsStringExt] trait, which provides the +//! [`from_vec`] method that consumes a `Vec<u8>` and produces an `OsString`. //! //! ## On Windows //! @@ -119,8 +131,8 @@ //! On Windows, [`OsStr`] implements the //! std::os::windows::ffi::[OsStrExt][windows.OsStrExt] trait, //! which provides an [`encode_wide`] method. This provides an -//! iterator that can be [`collect`]ed into a vector of [`u16`]. After a nul -//! characters is appended, this is the same as a native Windows string. +//! iterator that can be [`collect`]ed into a vector of [`u16`]. After a 16-bit nul +//! character is appended, this is the same as a native Windows string. //! //! Additionally, on Windows [`OsString`] implements the //! std::os::windows:ffi::[OsStringExt][windows.OsStringExt] @@ -133,10 +145,8 @@ //! [`env::var_os()`]: crate::env::var_os "env::var_os" //! [unix.OsStringExt]: crate::os::unix::ffi::OsStringExt "os::unix::ffi::OsStringExt" //! [`from_vec`]: crate::os::unix::ffi::OsStringExt::from_vec "os::unix::ffi::OsStringExt::from_vec" -//! [`into_vec`]: crate::os::unix::ffi::OsStringExt::into_vec "os::unix::ffi::OsStringExt::into_vec" //! [unix.OsStrExt]: crate::os::unix::ffi::OsStrExt "os::unix::ffi::OsStrExt" //! [`from_bytes`]: crate::os::unix::ffi::OsStrExt::from_bytes "os::unix::ffi::OsStrExt::from_bytes" -//! [`as_bytes`]: crate::os::unix::ffi::OsStrExt::as_bytes "os::unix::ffi::OsStrExt::as_bytes" //! [`OsStrExt`]: crate::os::unix::ffi::OsStrExt "os::unix::ffi::OsStrExt" //! [windows.OsStrExt]: crate::os::windows::ffi::OsStrExt "os::windows::ffi::OsStrExt" //! 
[`encode_wide`]: crate::os::windows::ffi::OsStrExt::encode_wide "os::windows::ffi::OsStrExt::encode_wide" diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 9b5e5d6c0cc4b..e4d0ea58d18db 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -30,20 +30,27 @@ use crate::sys_common::{AsInner, FromInner, IntoInner}; /// /// `OsString` and [`OsStr`] bridge this gap by simultaneously representing Rust /// and platform-native string values, and in particular allowing a Rust string -/// to be converted into an "OS" string with no cost if possible. A consequence -/// of this is that `OsString` instances are *not* `NUL` terminated; in order -/// to pass to e.g., Unix system call, you should create a [`CStr`]. +/// to be converted into an "OS" string with no cost. A consequence of this is +/// that `OsString` instances are *not* `NUL` terminated; in order to pass to +/// e.g., a Unix system call, you should create a [`CStr`]. /// -/// `OsString` is to &[OsStr] as [`String`] is to &[str]: the former -/// in each pair are owned strings; the latter are borrowed -/// references. +/// `OsString` is to &[OsStr] as [`String`] is to &[str]: `OsString` is +/// an owned string like `String`, while `&OsStr` is a borrowed reference like `&str`. /// -/// Note, `OsString` and [`OsStr`] internally do not necessarily hold strings in -/// the form native to the platform; While on Unix, strings are stored as a -/// sequence of 8-bit values, on Windows, where strings are 16-bit value based -/// as just discussed, strings are also actually stored as a sequence of 8-bit -/// values, encoded in a less-strict variant of UTF-8. This is useful to -/// understand when handling capacity and length values. +/// Note that `OsString` and [`OsStr`] internally do not necessarily hold strings in the form +/// native to the platform. 
On all platforms, `OsString` and `OsStr` consist of a sequence of +/// bytes, in a superset of UTF-8; any valid UTF-8 sequence is a valid `OsString` or `OsStr`. +/// * On Unix, these bytes can contain any values, in an arbitrary encoding (not necessarily +/// UTF-8, and not necessarily the same encoding for different OS strings). +/// * On Windows, where the native OS uses a sequence of 16-bit values, `OsString` and `OsStr` +/// still consist of a sequence of 8-bit values, encoded in a superset of UTF-8 called +/// ["WTF-8"](https://simonsapin.github.io/wtf-8/) ("Wobbly Translation Format 8-bit"). The +/// WTF-8 format allows encoding arbitrary 16-bit values, including unpaired UTF-16 surrogates +/// that do not constitute valid Unicode, since Windows accepts sequences of arbitrary 16-bit +/// values. (In practice, Windows filenames and similar are almost always valid UTF-16.) +/// +/// Capacity and length values are always in terms of the sequence of bytes, not characters or +/// 16-bit values. /// /// # Creating an `OsString` /// @@ -65,8 +72,16 @@ use crate::sys_common::{AsInner, FromInner, IntoInner}; /// /// # Conversions /// +/// `OsStr` provides the method [`OsStr::as_bytes`], which provides a zero-cost conversion to a +/// byte slice. (`OsString` provides this method as well, along with all other `OsStr` methods, via +/// `Deref`.) +/// +/// `OsString` provides the method [`OsString::into_vec`], which provides a zero-cost conversion to +/// `Vec<u8>`. +/// /// See the [module's toplevel documentation about conversions][conversions] for a discussion on -/// the traits which `OsString` implements for [conversions] from/to native representations. +/// OS-specific traits which `OsString` and `OsStr` implement for [conversions] from/to native +/// representations. 
/// /// [`CStr`]: crate::ffi::CStr /// [conversions]: super#conversions @@ -163,6 +178,24 @@ impl OsString { self.inner.into_string().map_err(|buf| OsString { inner: buf }) } + /// Converts the `OsString` into a `Vec<u8>`. + /// + /// # Examples + /// + /// ``` + /// #![feature(osstr_bytes)] + /// use std::ffi::OsString; + /// + /// let os_string = OsString::from("foo"); + /// let v = os_string.into_vec(); + /// assert_eq!(v, b"foo"); + /// ``` + #[unstable(feature = "osstr_bytes", issue = "none")] + #[inline] + pub fn into_vec(self) -> Vec<u8> { + self.inner.into_vec() + } + /// Extends the string with the given &[OsStr] slice. /// /// # Examples @@ -667,6 +700,23 @@ impl OsStr { self.inner.to_str() } + /// Converts the `OsStr` into a `&[u8]`. + /// + /// # Examples + /// + /// ``` + /// #![feature(osstr_bytes)] + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo"); + /// assert_eq!(os_str.as_bytes(), b"foo"); + /// ``` + #[unstable(feature = "osstr_bytes", issue = "none")] + #[inline] + pub fn as_bytes(&self) -> &[u8] { + self.inner.as_u8_slice() + } + /// Converts an `OsStr` to a [Cow]<[str]>. /// /// Any non-Unicode sequences are replaced with diff --git a/library/std/src/os/unix/ffi/mod.rs b/library/std/src/os/unix/ffi/mod.rs index 5b49f50763d74..357a89248ce7a 100644 --- a/library/std/src/os/unix/ffi/mod.rs +++ b/library/std/src/os/unix/ffi/mod.rs @@ -11,10 +11,6 @@ //! // OsStringExt::from_vec //! let os_string = OsString::from_vec(bytes); //! assert_eq!(os_string.to_str(), Some("foo")); -//! -//! // OsStringExt::into_vec -//! let bytes = os_string.into_vec(); -//! assert_eq!(bytes, b"foo"); //! ``` //! //! ``` @@ -26,10 +22,6 @@ //! // OsStrExt::from_bytes //! let os_str = OsStr::from_bytes(bytes); //! assert_eq!(os_str.to_str(), Some("foo")); -//! -//! // OsStrExt::as_bytes -//! let bytes = os_str.as_bytes(); -//! assert_eq!(bytes, b"foo"); //! ``` //! //! 
[`std::ffi`]: crate::ffi diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index ccbc182240cf3..09c2557086ea3 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -182,6 +182,11 @@ impl Buf { pub fn into_rc(&self) -> Rc<Slice> { self.as_slice().into_rc() } + + #[inline] + pub fn into_vec(self) -> Vec<u8> { + self.inner + } } impl Slice { @@ -190,6 +195,11 @@ impl Slice { unsafe { mem::transmute(s) } } + #[inline] + pub fn as_u8_slice(&self) -> &[u8] { + unsafe { mem::transmute(self) } + } + #[inline] pub fn from_str(s: &str) -> &Slice { Slice::from_u8_slice(s.as_bytes()) diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 78e92a3331a1c..c11d8730b6c72 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -146,6 +146,11 @@ impl Buf { pub fn into_rc(&self) -> Rc<Slice> { self.as_slice().into_rc() } + + #[inline] + pub fn into_vec(self) -> Vec<u8> { + self.inner.into_vec() + } } impl Slice { @@ -193,6 +198,11 @@ impl Slice { unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } } + #[inline] + pub fn as_u8_slice(&self) -> &[u8] { + self.inner.as_inner() + } + #[inline] pub fn make_ascii_lowercase(&mut self) { self.inner.make_ascii_lowercase() diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 10ef6662115c1..af8c1e3349437 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -3,13 +3,6 @@ //! This library uses Rust’s type system to maintain //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), //! like the `String` and `&str` types do for UTF-8. -//! -//! Since [WTF-8 must not be used -//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), -//! this library deliberately does not provide access to the underlying bytes -//! of WTF-8 strings, -//! nor can it decode WTF-8 from arbitrary bytes. -//! 
WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. // this module is imported from @SimonSapin's repo and has tons of dead code on // unix (it's mostly used on windows), so don't worry about dead code here. @@ -399,6 +392,12 @@ impl Wtf8Buf { let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; Wtf8Buf { bytes: bytes.into_vec() } } + + /// Converts this `Wtf8Buf` into a `Vec<u8>`. + #[inline] + pub fn into_vec(self) -> Vec<u8> { + self.bytes + } } /// Creates a new WTF-8 string from an iterator of code points. diff --git a/src/test/ui/env-funky-keys.rs b/src/test/ui/env-funky-keys.rs index 4548d3339472d..04e3e45122c56 100644 --- a/src/test/ui/env-funky-keys.rs +++ b/src/test/ui/env-funky-keys.rs @@ -9,6 +9,7 @@ // no-prefer-dynamic #![feature(rustc_private)] +#![feature(osstr_bytes)] extern crate libc; @@ -16,7 +17,6 @@ use libc::c_char; use libc::execve; use std::env; use std::ffi::CString; -use std::os::unix::prelude::*; use std::ptr; fn main() { From af7b7da80e7a7d691f7376662b34ac4d5e9420d0 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 24 Mar 2022 22:09:37 -0700 Subject: [PATCH 2/2] Work around rustdoc ICE by dropping some intra-doc links --- library/std/src/ffi/mod.rs | 6 +++--- library/std/src/ffi/os_str.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/library/std/src/ffi/mod.rs b/library/std/src/ffi/mod.rs index d3971334462c0..7fda7497e616c 100644 --- a/library/std/src/ffi/mod.rs +++ b/library/std/src/ffi/mod.rs @@ -105,11 +105,11 @@ //! On all platforms, `OsStr` and `OsString` consist of a sequence of bytes; see [`OsString`] for //! more details on its encoding on different platforms. //! -//! `OsStr` provides the method [`OsStr::as_bytes`], which provides a zero-cost conversion to a -//! byte slice. (`OsString` provides this method as well, along with all other `OsStr` methods, via +//! `OsStr` provides the method `OsStr::as_bytes`, which provides a zero-cost conversion to a byte +//! slice. 
(`OsString` provides this method as well, along with all other `OsStr` methods, via //! `Deref`.) //! -//! `OsString` provides the method [`OsString::into_vec`], which provides a zero-cost conversion to +//! `OsString` provides the method `OsString::into_vec`, which provides a zero-cost conversion to //! `Vec<u8>`. //! //! ## On Unix diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index e4d0ea58d18db..b287c4053d93d 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -72,11 +72,11 @@ use crate::sys_common::{AsInner, FromInner, IntoInner}; /// /// # Conversions /// -/// `OsStr` provides the method [`OsStr::as_bytes`], which provides a zero-cost conversion to a -/// byte slice. (`OsString` provides this method as well, along with all other `OsStr` methods, via +/// `OsStr` provides the method `OsStr::as_bytes`, which provides a zero-cost conversion to a byte +/// slice. (`OsString` provides this method as well, along with all other `OsStr` methods, via /// `Deref`.) /// -/// `OsString` provides the method [`OsString::into_vec`], which provides a zero-cost conversion to +/// `OsString` provides the method `OsString::into_vec`, which provides a zero-cost conversion to /// `Vec<u8>`. /// /// See the [module's toplevel documentation about conversions][conversions] for a discussion on