Auto merge of rust-lang#135408 - RalfJung:x86-sse2, r=workingjubilee
x86: use SSE2 to pass float and SIMD types

This builds on the new X86Sse2 ABI landed in rust-lang#137037 to actually make it a separate ABI from the default x86 ABI, and use SSE2 registers. Specifically, we use it in two ways: to return `f64` values in a register rather than by pointer, and to pass vectors of up to 128 bits in a register (or, well, whatever LLVM does when passing `<4 x float>` by value; I don't actually know whether that ends up in a register).

Cc `@workingjubilee`
Fixes rust-lang#133611

try-job: aarch64-apple
try-job: aarch64-gnu
try-job: aarch64-gnu-debug
try-job: test-various
try-job: x86_64-gnu-nopt
try-job: dist-i586-gnu-i586-i686-musl
try-job: x86_64-msvc-1
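
For a sense of what this changes, here is an illustrative sketch (hypothetical functions, not code from this PR): on an i686 target that guarantees SSE2 and uses the new X86Sse2 ABI, a Rust-ABI `f64` return now comes back in `%xmm0` instead of through a hidden return-area pointer, and a 128-bit vector argument is passed as an immediate rather than indirectly.

```rust
// Illustrative sketch only -- hypothetical functions, not from this commit.
#[cfg(target_arch = "x86")]
use std::arch::x86::__m128;

// Previously returned via a hidden return-area pointer on i686 (f64 is wider
// than a 32-bit pointer); with the X86Sse2 ABI it is returned in %xmm0.
#[no_mangle]
pub fn ret_f64(x: f64) -> f64 {
    x + 1.0
}

// Previously passed indirectly (simd_types_indirect); with the X86Sse2 ABI a
// 128-bit vector is passed directly as <4 x float>.
#[cfg(target_arch = "x86")]
#[no_mangle]
pub fn pass_m128(v: __m128) -> __m128 {
    v
}
```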
bors committed Feb 19, 2025
2 parents f44efbf + 803feb5 commit 17c1c32
Showing 14 changed files with 273 additions and 151 deletions.
118 changes: 74 additions & 44 deletions compiler/rustc_target/src/callconv/mod.rs
@@ -7,7 +7,7 @@ use rustc_abi::{
};
use rustc_macros::HashStable_Generic;

use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};

mod aarch64;
mod amdgpu;
@@ -386,6 +386,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
/// Pass this argument directly instead. Should NOT be used!
/// Only exists because of past ABI mistakes that will take time to fix
/// (see <https://github.com/rust-lang/rust/issues/115666>).
#[track_caller]
pub fn make_direct_deprecated(&mut self) {
match self.mode {
PassMode::Indirect { .. } => {
@@ -398,6 +399,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {

/// Pass this argument indirectly, by passing a (thin or wide) pointer to the argument instead.
/// This is valid for both sized and unsized arguments.
#[track_caller]
pub fn make_indirect(&mut self) {
match self.mode {
PassMode::Direct(_) | PassMode::Pair(_, _) => {
@@ -412,6 +414,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {

/// Same as `make_indirect`, but for arguments that are ignored. Only needed for ABIs that pass
/// ZSTs indirectly.
#[track_caller]
pub fn make_indirect_from_ignore(&mut self) {
match self.mode {
PassMode::Ignore => {
@@ -716,27 +719,46 @@ impl<'a, Ty> FnAbi<'a, Ty> {
C: HasDataLayout + HasTargetSpec,
{
let spec = cx.target_spec();
match &spec.arch[..] {
match &*spec.arch {
"x86" => x86::compute_rust_abi_info(cx, self, abi),
"riscv32" | "riscv64" => riscv::compute_rust_abi_info(cx, self, abi),
"loongarch64" => loongarch::compute_rust_abi_info(cx, self, abi),
"aarch64" => aarch64::compute_rust_abi_info(cx, self),
_ => {}
};

// Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
// May only return `true` if the target will always pass those arguments the same way,
// no matter what the user does with `-Ctarget-feature`! In other words, whatever
// target features are required to pass a SIMD value in registers must be listed in
// the `abi_required_features` for the current target and ABI.
let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
// On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
// to 128-bit-sized vectors.
"x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
"x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
arg.layout.size.bits() <= 128
}
// So far, we haven't implemented this logic for any other target.
_ => false,
};

for (arg_idx, arg) in self
.args
.iter_mut()
.enumerate()
.map(|(idx, arg)| (Some(idx), arg))
.chain(iter::once((None, &mut self.ret)))
{
if arg.is_ignore() {
// If the logic above already picked a specific type to cast the argument to, leave that
// in place.
if matches!(arg.mode, PassMode::Ignore | PassMode::Cast { .. }) {
continue;
}

if arg_idx.is_none()
&& arg.layout.size > Primitive::Pointer(AddressSpace::DATA).size(cx) * 2
&& !matches!(arg.layout.backend_repr, BackendRepr::Vector { .. })
{
// Return values larger than 2 registers using a return area
// pointer. LLVM and Cranelift disagree about how to return
// values that don't fit in the registers designated for return
// values. LLVM will force the entire return value to be passed
// by return area pointer, while Cranelift will look at each IR level
// return value independently and decide to pass it in a
// register or not, which would result in the return value
// being passed partially in registers and partially through a
// return area pointer.
// return area pointer. For large IR-level values such as `i128`,
// cranelift will even split up the value into smaller chunks.
//
// While Cranelift may need to be fixed as the LLVM behavior is
// generally more correct with respect to the surface language,
@@ -776,53 +799,60 @@ impl<'a, Ty> FnAbi<'a, Ty> {
// rustc_target already ensures any return value which doesn't
// fit in the available amount of return registers is passed in
// the right way for the current target.
//
// The adjustment is not necessary nor desired for types with a vector
// representation; those are handled below.
arg.make_indirect();
continue;
}

match arg.layout.backend_repr {
BackendRepr::Memory { .. } => {}

// This is a fun case! The gist of what this is doing is
// that we want callers and callees to always agree on the
// ABI of how they pass SIMD arguments. If we were to *not*
// make these arguments indirect then they'd be immediates
// in LLVM, which means that they'd use whatever the
// appropriate ABI is for the callee and the caller. That
// means, for example, if the caller doesn't have AVX
// enabled but the callee does, then passing an AVX argument
// across this boundary would cause corrupt data to show up.
//
// This problem is fixed by unconditionally passing SIMD
// arguments through memory between callers and callees
// which should get them all to agree on ABI regardless of
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as
// that's how we connect up to LLVM and it's unstable
// anyway, we control all calls to it in libstd.
BackendRepr::Vector { .. }
if abi != ExternAbi::RustIntrinsic && spec.simd_types_indirect =>
{
arg.make_indirect();
continue;
BackendRepr::Memory { .. } => {
// Compute `Aggregate` ABI.

let is_indirect_not_on_stack =
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
assert!(is_indirect_not_on_stack);

let size = arg.layout.size;
if arg.layout.is_sized()
&& size <= Primitive::Pointer(AddressSpace::DATA).size(cx)
{
// We want to pass small aggregates as immediates, but using
// an LLVM aggregate type for this leads to bad optimizations,
// so we pick an appropriately sized integer type instead.
arg.cast_to(Reg { kind: RegKind::Integer, size });
}
}

_ => continue,
}
// Compute `Aggregate` ABI.

let is_indirect_not_on_stack =
matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
assert!(is_indirect_not_on_stack);

let size = arg.layout.size;
if !arg.layout.is_unsized() && size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// We want to pass small aggregates as immediates, but using
// an LLVM aggregate type for this leads to bad optimizations,
// so we pick an appropriately sized integer type instead.
arg.cast_to(Reg { kind: RegKind::Integer, size });
BackendRepr::Vector { .. } => {
// This is a fun case! The gist of what this is doing is
// that we want callers and callees to always agree on the
// ABI of how they pass SIMD arguments. If we were to *not*
// make these arguments indirect then they'd be immediates
// in LLVM, which means that they'd use whatever the
// appropriate ABI is for the callee and the caller. That
// means, for example, if the caller doesn't have AVX
// enabled but the callee does, then passing an AVX argument
// across this boundary would cause corrupt data to show up.
//
// This problem is fixed by unconditionally passing SIMD
// arguments through memory between callers and callees
// which should get them all to agree on ABI regardless of
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as those are not
// real functions anyway, and the backend expects very specific types.
if abi != ExternAbi::RustIntrinsic
&& spec.simd_types_indirect
&& !can_pass_simd_directly(arg)
{
arg.make_indirect();
}
}

_ => {}
}
}
}
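Condensed into a stand-alone sketch, the new `can_pass_simd_directly` predicate behaves roughly as follows (simplified stand-in types replace `TargetSpec`/`ArgAbi`; sizes are in bits):

```rust
// Stand-alone sketch of the predicate above, with hand-rolled stand-in types;
// the real code operates on rustc_target's ArgAbi and TargetSpec.
#[derive(PartialEq)]
enum RustcAbi {
    X86Sse2,
    X86Softfloat,
}

fn can_pass_simd_directly(arch: &str, rustc_abi: Option<&RustcAbi>, size_bits: u64) -> bool {
    match arch {
        // 32-bit x86: only if the target unconditionally mandates SSE2, so a
        // caller and callee can never disagree about where the value lives.
        "x86" => rustc_abi == Some(&RustcAbi::X86Sse2) && size_bits <= 128,
        // x86_64 has SSE2 as a baseline ABI feature, except on softfloat targets.
        "x86_64" => rustc_abi != Some(&RustcAbi::X86Softfloat) && size_bits <= 128,
        // No other architecture implements this logic yet.
        _ => false,
    }
}
```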
14 changes: 11 additions & 3 deletions compiler/rustc_target/src/callconv/x86.rs
@@ -4,7 +4,7 @@ use rustc_abi::{
};

use crate::callconv::{ArgAttribute, FnAbi, PassMode};
use crate::spec::HasTargetSpec;
use crate::spec::{HasTargetSpec, RustcAbi};

#[derive(PartialEq)]
pub(crate) enum Flavor {
@@ -236,8 +236,16 @@
_ => false, // anyway not passed via registers on x86
};
if has_float {
if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// Same size or smaller than pointer, return in a register.
if cx.target_spec().rustc_abi == Some(RustcAbi::X86Sse2)
&& fn_abi.ret.layout.backend_repr.is_scalar()
&& fn_abi.ret.layout.size.bits() <= 128
{
// This is a single scalar that fits into an SSE register, and the target uses the
// SSE ABI. We prefer this over integer registers as float scalars need to be in SSE
// registers for float operations, so that's the best place to pass them around.
fn_abi.ret.cast_to(Reg { kind: RegKind::Vector, size: fn_abi.ret.layout.size });
} else if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
// Same size or smaller than pointer, return in an integer register.
fn_abi.ret.cast_to(Reg { kind: RegKind::Integer, size: fn_abi.ret.layout.size });
} else {
// Larger than a pointer, return indirectly.
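The new return-value logic in `x86.rs` amounts to a three-way classification. As a rough sketch, with plain inputs standing in for rustc's layout machinery (names here are illustrative):

```rust
// Sketch of the float-return classification above; inputs are simplified.
enum FloatRet {
    SseReg,   // cast to Reg { kind: RegKind::Vector, .. } -> returned in %xmm0
    IntReg,   // cast to Reg { kind: RegKind::Integer, .. } -> returned in %eax/%edx
    Indirect, // returned through a hidden return-area pointer
}

fn classify_float_ret(sse2_abi: bool, is_scalar: bool, size_bits: u64, ptr_bits: u64) -> FloatRet {
    if sse2_abi && is_scalar && size_bits <= 128 {
        // Float scalars need to be in SSE registers for arithmetic anyway,
        // so that is the cheapest place to hand them over.
        FloatRet::SseReg
    } else if size_bits <= ptr_bits {
        FloatRet::IntReg
    } else {
        FloatRet::Indirect
    }
}
```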
7 changes: 4 additions & 3 deletions tests/assembly/closure-inherit-target-feature.rs
@@ -8,8 +8,9 @@

use std::arch::x86_64::{__m128, _mm_blend_ps};

// Use an explicit return pointer to prevent tail call optimization.
#[no_mangle]
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128, ret: *mut __m128) {
let f = {
// check that _mm_blend_ps is not being inlined into the closure
// CHECK-LABEL: {{sse41_blend_nofeature.*closure.*:}}
@@ -18,9 +19,9 @@ pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
// CHECK-NOT: blendps
// CHECK: ret
#[inline(never)]
|x, y| _mm_blend_ps(x, y, 0b0101)
|x, y, ret: *mut __m128| unsafe { *ret = _mm_blend_ps(x, y, 0b0101) }
};
f(x, y)
f(x, y, ret);
}

#[no_mangle]
42 changes: 18 additions & 24 deletions tests/assembly/x86-return-float.rs
@@ -33,19 +33,18 @@ use minicore::*;
// CHECK-LABEL: return_f32:
#[no_mangle]
pub fn return_f32(x: f32) -> f32 {
// CHECK: movl {{.*}}(%ebp), %eax
// CHECK-NOT: ax
// CHECK: retl
// CHECK: movss {{.*}}(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

// CHECK-LABEL: return_f64:
#[no_mangle]
pub fn return_f64(x: f64) -> f64 {
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
// CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL:.*]]
// CHECK-NEXT: movsd %[[VAL]], (%[[PTR]])
// CHECK: retl
// CHECK: movsd {{.*}}(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

@@ -157,7 +156,7 @@ pub unsafe fn call_f32(x: &mut f32) {
}
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
// CHECK: calll {{()|_}}get_f32
// CHECK-NEXT: movl %eax, (%[[PTR]])
// CHECK-NEXT: movss %xmm0, (%[[PTR]])
*x = get_f32();
}

@@ -169,8 +168,7 @@
}
// CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
// CHECK: calll {{()|_}}get_f64
// CHECK: movsd {{.*}}(%{{ebp|esp}}), %[[VAL:.*]]
// CHECK-NEXT: movsd %[[VAL:.*]], (%[[PTR]])
// CHECK-NEXT: movlps %xmm0, (%[[PTR]])
*x = get_f64();
}

@@ -315,25 +313,21 @@ pub unsafe fn call_other_f64(x: &mut (usize, f64)) {
#[no_mangle]
pub fn return_f16(x: f16) -> f16 {
// CHECK: pushl %ebp
// CHECK: movl %esp, %ebp
// CHECK: movzwl 8(%ebp), %eax
// CHECK: popl %ebp
// CHECK: retl
// CHECK-NEXT: movl %esp, %ebp
// CHECK-NEXT: pinsrw $0, 8(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}

// CHECK-LABEL: return_f128:
#[no_mangle]
pub fn return_f128(x: f128) -> f128 {
// CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+4]](%ebp), %[[VAL1:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+12]](%ebp), %[[VAL3:.*]]
// CHECK-NEXT: movl [[#%d,OFFSET+16]](%ebp), %[[VAL4:.*]]
// CHECK-NEXT: movl %[[VAL4:.*]] 12(%[[PTR]])
// CHECK-NEXT: movl %[[VAL3:.*]] 8(%[[PTR]])
// CHECK-NEXT: movl %[[VAL2:.*]] 4(%[[PTR]])
// CHECK-NEXT: movl %[[VAL1:.*]] (%[[PTR]])
// CHECK: retl
// CHECK: pushl %ebp
// CHECK-NEXT: movl %esp, %ebp
// linux-NEXT: movaps 8(%ebp), %xmm0
// win-NEXT: movups 8(%ebp), %xmm0
// CHECK-NEXT: popl %ebp
// CHECK-NEXT: retl
x
}
36 changes: 36 additions & 0 deletions tests/codegen/abi-x86-sse.rs
@@ -0,0 +1,36 @@
//@ compile-flags: -Z merge-functions=disabled

//@ revisions: x86-64
//@[x86-64] compile-flags: --target x86_64-unknown-linux-gnu
//@[x86-64] needs-llvm-components: x86

//@ revisions: x86-32
//@[x86-32] compile-flags: --target i686-unknown-linux-gnu
//@[x86-32] needs-llvm-components: x86

//@ revisions: x86-32-nosse
//@[x86-32-nosse] compile-flags: --target i586-unknown-linux-gnu
//@[x86-32-nosse] needs-llvm-components: x86

#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
#![no_core]
#![crate_type = "lib"]

#[lang = "sized"]
trait Sized {}

#[lang = "copy"]
trait Copy {}

// Ensure this type is passed without ptr indirection on targets that
// require SSE2.
#[repr(simd)]
pub struct Sse([f32; 4]);

// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
#[no_mangle]
pub fn sse_id(x: Sse) -> Sse {
x
}