From 7f17639803fdc73f9ab9bd60a315596ea8881af9 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 30 Jul 2024 14:02:51 -0700
Subject: [PATCH 1/4] [SLP]Fix PR101213: Reuse extractelement, only if its
 vector operand comes before new vector value.

When trying to reuse an extractelement instruction, we need to check
that it is inserted at the proper position: its original vector operand
must come before the new vector value; otherwise, a new extractelement
instruction must be generated.

Fixes https://github.com/llvm/llvm-project/issues/101213

(cherry picked from commit f70f1228035c9610de38e0e376afdacb647c4ad9)
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  9 +++-
 .../X86/extract-vectorized-operand.ll         | 49 +++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ab2b96cdc42db..007ffedab7df4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13889,11 +13889,16 @@ Value *BoUpSLP::vectorizeTree(
       }
       if (!Ex) {
         // "Reuse" the existing extract to improve final codegen.
-        if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+        if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+            ES && isa<Instruction>(Vec)) {
           Value *V = ES->getVectorOperand();
           if (const TreeEntry *ETE = getTreeEntry(V))
             V = ETE->VectorizedValue;
-          Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+          if (auto *IV = dyn_cast<Instruction>(V);
+              !IV || IV == Vec || IV->comesBefore(cast<Instruction>(Vec)))
+            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+          else
+            Ex = Builder.CreateExtractElement(Vec, Lane);
         } else if (ReplaceGEP) {
           // Leave the GEPs as is, they are free in most cases and better to
           // keep them as GEPs.
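The following standalone C++ sketch (not part of the patch, and placed here only for illustration) restates the guard added in the hunk above: an existing extractelement's source vector may be reused only if it is already materialized no later than the newly created vector value at the insertion point. The helper name canReuseExtractSource is hypothetical; the sketch assumes only the public llvm::Instruction API (dyn_cast, comesBefore), and it adds a same-block check that the patch itself can rely on implicitly.

// Illustrative sketch only: mirrors the reuse condition added above.
// Values that are not instructions (arguments, constants) are always
// available; otherwise the existing source must be defined in the same
// basic block as, and before, the new vector value.
#include "llvm/IR/Instructions.h"

static bool canReuseExtractSource(llvm::Value *ExistingSrc, llvm::Value *NewVec) {
  auto *SrcI = llvm::dyn_cast<llvm::Instruction>(ExistingSrc);
  if (!SrcI)
    return true; // Constants and arguments dominate everything.
  auto *NewI = llvm::dyn_cast<llvm::Instruction>(NewVec);
  if (!NewI || SrcI == NewI)
    return true;
  // comesBefore requires both instructions to live in the same block.
  return SrcI->getParent() == NewI->getParent() && SrcI->comesBefore(NewI);
}

When this predicate is false, the patch falls back to emitting a fresh extractelement from the new vector value (Vec) instead of reusing the old source, which is exactly what the regression test below exercises.
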
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll new file mode 100644 index 0000000000000..f1a5709d07f02 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB43:.*]] +; CHECK: [[BB20:.*]]: +; CHECK-NEXT: br label %[[BB105:.*]] +; CHECK: [[BB43]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP1:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-NEXT: br i1 false, label %[[BB105]], label %[[BB51]] +; CHECK: [[BB51]]: +; CHECK-NEXT: [[TMP1]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ] +; CHECK-NEXT: br label %[[BB43]] +; CHECK: [[BB54]]: +; CHECK-NEXT: br label %[[BB51]] +; CHECK: [[BB105]]: +; CHECK-NEXT: [[PHI106:%.*]] = phi ptr addrspace(1) [ null, %[[BB20]] ], [ null, %[[BB43]] ] +; CHECK-NEXT: ret void +; +bb: + %0 = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <2 x i32> + %1 = extractelement <2 x ptr addrspace(1)> %0, i32 0 + %2 = extractelement <2 x ptr addrspace(1)> %0, i32 1 + br label %bb43 + +bb20: + br label %bb105 + +bb43: + %phi441 = phi ptr addrspace(1) [ %4, %bb51 ], [ %2, %bb ] + %phi452 = phi ptr addrspace(1) [ %5, %bb51 ], [ %1, %bb ] + br i1 false, label %bb105, label %bb51 + +bb51: + %3 = phi <2 x ptr addrspace(1)> [ poison, %bb54 ], [ zeroinitializer, %bb43 ] + %4 = extractelement <2 x ptr addrspace(1)> %3, i32 0 + %5 = extractelement <2 x ptr addrspace(1)> %3, i32 1 + br label %bb43 + +bb54: + br label %bb51 + +bb105: + %phi106 = phi ptr addrspace(1) [ %1, %bb20 ], [ null, %bb43 ] + ret void +} + From 94c2702791347a15f293da041685ca7facc267a5 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 16 Oct 2024 00:48:43 -0400 Subject: [PATCH 2/4] [JITLink] Fix i686 R_386_32 and other relocation values (#111091) Fix R_386_32 and other relocations by correcting Addend computations. (cherry picked from commit 5716f836d25e93bf8f664a14fe55c70e07a369be) --- .../llvm/ExecutionEngine/JITLink/i386.h | 33 +++++-------------- llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp | 20 +++++++++-- .../i386/ELF_i386_absolute_relocations_16.s | 10 +++++- .../i386/ELF_i386_absolute_relocations_32.s | 16 ++++++--- .../ELF_i386_pc_relative_relocations_32.s | 7 ++-- .../i386/ELF_i386_small_pic_relocations_got.s | 12 +++---- .../i386/ELF_i386_small_pic_relocations_plt.s | 6 ++-- 7 files changed, 60 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h index f8d24d8bf31ca..efe8182934dd7 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h @@ -39,12 +39,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a data/control flow instruction using PC-relative addressing /// to a target. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a PCRel32 with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. 
- /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -68,12 +64,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a data/control flow instruction using PC-relative addressing /// to a target. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a PCRel16 with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int16 + /// Fixup <- Target - Fixup + Addend : int16 /// /// Errors: /// - The result of the fixup expression must fit into an int16, otherwise @@ -86,7 +78,7 @@ enum EdgeKind_i386 : Edge::Kind { /// Delta from the fixup to the target. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend : int64 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -130,12 +122,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a PC-relative call or branch to a target. This can be used to /// identify, record, and/or patch call sites. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a Branch32PCRel with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -164,7 +152,7 @@ enum EdgeKind_i386 : Edge::Kind { /// target may be recorded to allow manipulation at runtime. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4 : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -180,7 +168,7 @@ enum EdgeKind_i386 : Edge::Kind { /// is within range of the fixup location. 
/// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4: int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -215,8 +203,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel32: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } @@ -231,8 +218,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel16: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (LLVM_LIKELY(isInt<16>(Value))) *(little16_t *)FixupPtr = Value; else @@ -257,8 +243,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case i386::BranchPCRel32: case i386::BranchPCRel32ToPtrJumpStub: case i386::BranchPCRel32ToPtrJumpStubBypassable: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp index 860165365a7e4..2d5f28cad1cc6 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp @@ -186,15 +186,29 @@ class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder { int64_t Addend = 0; switch (*Kind) { - case i386::EdgeKind_i386::Delta32: { + case i386::EdgeKind_i386::None: + break; + case i386::EdgeKind_i386::Pointer32: + case i386::EdgeKind_i386::PCRel32: + case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT: + case i386::EdgeKind_i386::Delta32: + case i386::EdgeKind_i386::Delta32FromGOT: + case i386::EdgeKind_i386::BranchPCRel32: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStub: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStubBypassable: { const char *FixupContent = BlockToFix.getContent().data() + (FixupAddress - BlockToFix.getAddress()); - Addend = *(const support::ulittle32_t *)FixupContent; + Addend = *(const support::little32_t *)FixupContent; break; } - default: + case i386::EdgeKind_i386::Pointer16: + case i386::EdgeKind_i386::PCRel16: { + const char *FixupContent = BlockToFix.getContent().data() + + (FixupAddress - BlockToFix.getAddress()); + Addend = *(const support::little16_t *)FixupContent; break; } + } Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s index 47142c4be3c09..092f7d753c7ea 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s @@ -22,4 +22,12 @@ main: .type bar,@function bar: retw $external_data - .size bar, .-bar \ No newline at end of file + .size bar, .-bar + +# jitlink-check: decode_operand(baz, 0) = external_data + 23 + .globl baz + .align 2, 0x90 + .type baz,@function +baz: + retw $external_data+23 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s index 
e4b02a794bbc4..a66ad8e7cda67 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s @@ -7,17 +7,25 @@ # Test ELF 32 bit absolute relocations .text - .globl main + .globl main .p2align 4, 0x90 .type main,@function -main: +main: retl .size main, .-main # jitlink-check: decode_operand(foo, 0) = external_data - .globl foo + .globl foo .p2align 4, 0x90 .type foo,@function foo: movl external_data, %eax - .size foo, .-foo \ No newline at end of file + .size foo, .-foo + +# jitlink-check: decode_operand(bar, 0) = external_data + 4000 + .globl bar + .p2align 4, 0x90 + .type bar,@function +bar: + movl external_data + 4000, %eax + .size bar, .-bar diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s index df74c7bb39324..0717c8f434d53 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s @@ -33,11 +33,12 @@ foo: # Tests PC relative relocation for negative offset from PC -# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) +# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) + 1 .globl fooz .p2align 4 .type fooz,@function fooz: + nop retl .size fooz, .-fooz @@ -45,5 +46,5 @@ fooz: .p2align 4 .type baz,@function baz: - calll fooz - .size baz, .-baz \ No newline at end of file + calll fooz+1 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s index 91049a8a87a55..080341ac3bfed 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s @@ -19,29 +19,29 @@ main: # Test GOT32 handling. # # We want to check both the offset to the GOT entry and its contents. -# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ + 42 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data1)) = named_data1 # -# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ + 5 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data2)) = named_data2 .globl test_got .p2align 4, 0x90 .type test_got,@function test_got: - leal named_data1@GOT, %eax - leal named_data2@GOT, %eax + leal named_data1@GOT+42, %eax + leal named_data2@GOT+5, %eax .size test_got, .-test_got # Test GOTOFF64 handling. 
-# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ + 99 .globl test_gotoff .p2align 4, 0x90 .type test_gotoff,@function test_gotoff: - mov $named_func@GOTOFF, %eax + mov $named_func@GOTOFF+99, %eax .size test_gotoff, .-test_gotoff diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s index e5725a2b52c30..ce565ca2fcdda 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s @@ -27,12 +27,12 @@ main: # for position independent code, first, as there may be future use-cases # where we would want to disable the optimization. # -# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt) +# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt) + 53 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_plt.o, external_func))= external_func .globl test_call_extern_plt .p2align 4, 0x90 .type test_call_extern_plt,@function test_call_extern_plt: - call external_func@plt + call external_func@plt + 53 - .size test_call_extern_plt, .-test_call_extern_plt \ No newline at end of file + .size test_call_extern_plt, .-test_call_extern_plt From af93c5a615403d617f016a2abaf33e6c3dfb4344 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 29 Aug 2024 15:17:40 +0200 Subject: [PATCH 3/4] [X86] Prefer `lock or` over mfence (llvm/llvm-project#106555) Originally opened as https://reviews.llvm.org/D129947 LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs lock or is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html) and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved into this direction as well, but didn't touch fence seq_cst. Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/ After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction. (cherry picked from commit 4d502dd7dd8c505775763bd783bb33678bff9e63) (cherry picked from commit 707ca0e44f57bc2235d8ea29376ef45de9a1adb8) --- llvm/lib/Target/X86/X86.td | 52 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +- llvm/test/CodeGen/X86/atomic-idempotent.ll | 86 +-- llvm/test/CodeGen/X86/atomic-unordered.ll | 765 ++++++++++++++++++++- llvm/test/CodeGen/X86/mfence.ll | 32 + 5 files changed, 847 insertions(+), 107 deletions(-) diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index e82e624f70997..729e151d255a8 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true", "Target has branch hint feature">; +def TuningAvoidMFENCE + : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true", + "Avoid MFENCE for fence seq_cst, and instead use lock or">; + //===----------------------------------------------------------------------===// // X86 CPU Families // TODO: Remove these - use general tuning features to determine codegen. 
@@ -809,7 +813,8 @@ def ProcessorFeatures { TuningSlow3OpsLEA, TuningSlowDivide64, TuningSlowIncDec, - TuningInsertVZEROUPPER + TuningInsertVZEROUPPER, + TuningAvoidMFENCE ]; list X86_64V2Features = !listconcat(X86_64V1Features, [ @@ -825,7 +830,8 @@ def ProcessorFeatures { TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER + TuningInsertVZEROUPPER, + TuningAvoidMFENCE ]; list X86_64V3Features = !listconcat(X86_64V2Features, [ @@ -844,7 +850,8 @@ def ProcessorFeatures { TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit + TuningAllowLight256Bit, + TuningAvoidMFENCE ]; list X86_64V4Features = !listconcat(X86_64V3Features, [ @@ -868,7 +875,8 @@ def ProcessorFeatures { TuningFastGather, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit + TuningAllowLight256Bit, + TuningAvoidMFENCE ]; // Nehalem @@ -876,7 +884,8 @@ def ProcessorFeatures { list NHMTuning = [TuningMacroFusion, TuningSlowDivide64, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov]; + TuningNoDomainDelayMov, + TuningAvoidMFENCE]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -897,7 +906,8 @@ def ProcessorFeatures { TuningFast15ByteNOP, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov]; + TuningNoDomainDelayMov, + TuningAvoidMFENCE]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -963,7 +973,8 @@ def ProcessorFeatures { TuningAllowLight256Bit, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend]; + TuningNoDomainDelayBlend, + TuningAvoidMFENCE]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -998,7 +1009,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -1041,7 +1053,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -1070,7 +1083,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -1216,7 +1230,8 @@ def ProcessorFeatures { // Tremont list TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list TRMTuning = GLPTuning; + list TRMAdditionalTuning = [TuningAvoidMFENCE]; + list TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning); list TRMFeatures = !listconcat(GLPFeatures, TRMAdditionalFeatures); @@ -1394,7 +1409,8 @@ def ProcessorFeatures { TuningFastImm16, TuningSBBDepBreaking, TuningSlowDivide64, - TuningSlowSHLD]; + TuningSlowSHLD, + TuningAvoidMFENCE]; list BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); @@ -1423,7 +1439,8 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningBranchFusion, TuningSBBDepBreaking, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningAvoidMFENCE]; // PileDriver list BdVer2AdditionalFeatures = [FeatureF16C, @@ -1503,7 +1520,8 @@ def ProcessorFeatures { TuningSlowSHLD, TuningSBBDepBreaking, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningAvoidMFENCE]; list ZN2AdditionalFeatures = [FeatureCLWB, 
FeatureRDPID, FeatureRDPRU, @@ -1691,7 +1709,8 @@ def : ProcModel; } foreach P = ["penryn", "core_2_duo_sse4_1"] in { @@ -1710,7 +1729,8 @@ def : ProcModel; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 45989bcd07d37..61cfda3a8d8bb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SSID == SyncScope::SingleThread) - // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at - // the IR level, so we must wrap it in an intrinsic. - return nullptr; - - if (!Subtarget.hasMFence()) - // FIXME: it might make sense to use a locked operation here but on a - // different cache-line to prevent cache-line bouncing. In practice it - // is probably a small win, and x86 processors without mfence are rare - // enough that we do not bother. - return nullptr; - Function *MFence = - llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); - Builder.CreateCall(MFence, {}); + // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct + // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence + Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad( @@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { - if (Subtarget.hasMFence()) + if (!Subtarget.avoidMFence() && Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll index d5c46485068a6..4deedd5726b24 100644 --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -27,18 +27,16 @@ define i8 @add8(ptr %p) { ; ; X86-SLM-LABEL: add8: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: xorl %eax, %eax -; X86-SLM-NEXT: lock xaddb %al, (%ecx) -; X86-SLM-NEXT: # kill: def $al killed $al killed $eax +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movzbl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: add8: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: xorl %eax, %eax -; X86-ATOM-NEXT: lock xaddb %al, (%ecx) -; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movzbl (%eax), %eax ; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl @@ -62,26 +60,18 @@ define i16 @or16(ptr %p) { ; ; X86-SLM-LABEL: or16: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movzwl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx) -; X86-SLM-NEXT: jne .LBB1_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; 
X86-SLM-NEXT: movzwl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: or16: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movzwl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx) -; X86-ATOM-NEXT: jne .LBB1_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movzwl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw or ptr %p, i16 0 acquire ret i16 %1 @@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) { ; ; X86-SLM-LABEL: xor32: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-SLM-NEXT: jne .LBB2_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: xor32: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-ATOM-NEXT: jne .LBB2_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw xor ptr %p, i32 0 release ret i32 %1 @@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) { ; ; X86-SLM-LABEL: and32: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-SLM-NEXT: jne .LBB5_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: and32: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-ATOM-NEXT: jne .LBB5_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw and ptr %p, i32 -1 acq_rel ret i32 %1 diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 3fb994cdb751a..ff101b9037f0e 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0 %s ; RUN: llc -O3 < %s 
-mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3 %s +; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -mattr=-avoid-mfence | FileCheck --check-prefixes=CHECK,CHECK-MFENCE %s define i8 @load_i8(ptr %ptr) { ; CHECK-O0-LABEL: load_i8: @@ -12,6 +13,11 @@ define i8 @load_i8(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 1 ret i8 %v } @@ -27,6 +33,11 @@ define void @store_i8(ptr %ptr, i8 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movb %sil, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movb %sil, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i8 %v, ptr %ptr unordered, align 1 ret void } @@ -41,6 +52,11 @@ define i16 @load_i16(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 2 ret i16 %v } @@ -57,6 +73,11 @@ define void @store_i16(ptr %ptr, i16 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movw %si, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movw %si, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i16 %v, ptr %ptr unordered, align 2 ret void } @@ -116,6 +137,11 @@ define void @narrow_writeback_or(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_or: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = or i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -138,6 +164,12 @@ define void @narrow_writeback_and(ptr %ptr) { ; CHECK-O3-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 ; CHECK-O3-NEXT: andq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 +; CHECK-MFENCE-NEXT: andq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -157,6 +189,11 @@ define void @narrow_writeback_xor(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_xor: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = xor i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -254,6 +291,14 @@ define void @store_i128(ptr %ptr, i128 %v) { ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i128: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovq %rdx, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %rsi, %xmm1 +; CHECK-MFENCE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-MFENCE-NEXT: vmovdqa %xmm0, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i128 %v, ptr %ptr unordered, align 16 ret void } @@ -305,6 +350,28 @@ define i256 
@load_i256(ptr %ptr) { ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: vzeroupper ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: pushq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: subq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: .cfi_offset %rbx, -16 +; CHECK-MFENCE-NEXT: movq %rdi, %rbx +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_load@PLT +; CHECK-MFENCE-NEXT: vmovups (%rsp), %ymm0 +; CHECK-MFENCE-NEXT: vmovups %ymm0, (%rbx) +; CHECK-MFENCE-NEXT: movq %rbx, %rax +; CHECK-MFENCE-NEXT: addq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: popq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: vzeroupper +; CHECK-MFENCE-NEXT: retq %v = load atomic i256, ptr %ptr unordered, align 16 ret i256 %v } @@ -345,6 +412,24 @@ define void @store_i256(ptr %ptr, i256 %v) { ; CHECK-O3-NEXT: addq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rsi, (%rsp) +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: movq %rax, %rsi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_store@PLT +; CHECK-MFENCE-NEXT: addq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: retq store atomic i256 %v, ptr %ptr unordered, align 16 ret void } @@ -366,6 +451,14 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -391,6 +484,14 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store_unaligned: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -496,6 +597,12 @@ define i64 @load_fold_add3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: addq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_add3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: addq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = add i64 %v, %v2 @@ -515,6 +622,12 @@ define i64 @load_fold_sub1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; 
CHECK-O3-NEXT: addq $-15, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addq $-15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sub i64 %v, 15 ret i64 %ret @@ -556,6 +669,13 @@ define i64 @load_fold_mul1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = mul i64 %v, 15 ret i64 %ret @@ -584,6 +704,12 @@ define i64 @load_fold_mul3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: imulq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: imulq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = mul i64 %v, %v2 @@ -613,6 +739,20 @@ define i64 @load_fold_sdiv1(ptr %p) { ; CHECK-O3-NEXT: addq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rdx, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rcx +; CHECK-MFENCE-NEXT: addq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, 15 ret i64 %ret @@ -644,6 +784,24 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB35_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB35_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, %v2 ret i64 %ret @@ -675,6 +833,25 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB36_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB36_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax 
+; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = sdiv i64 %v, %v2 @@ -699,6 +876,14 @@ define i64 @load_fold_udiv1(ptr %p) { ; CHECK-O3-NEXT: mulxq %rax, %rax, %rax ; CHECK-O3-NEXT: shrq $3, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, 15 ret i64 %ret @@ -730,6 +915,24 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB38_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB38_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, %v2 ret i64 %ret @@ -762,6 +965,25 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB39_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB39_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = udiv i64 %v, %v2 @@ -795,6 +1017,23 @@ define i64 @load_fold_srem1(ptr %p) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, 15 ret 
i64 %ret @@ -828,6 +1067,25 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB41_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB41_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, %v2 ret i64 %ret @@ -861,6 +1119,26 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB42_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB42_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = srem i64 %v, %v2 @@ -890,6 +1168,18 @@ define i64 @load_fold_urem1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx ; CHECK-O3-NEXT: subq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: mulxq %rcx, %rcx, %rcx +; CHECK-MFENCE-NEXT: shrq $3, %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,4), %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,2), %rcx +; CHECK-MFENCE-NEXT: subq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, 15 ret i64 %ret @@ -924,6 +1214,25 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB44_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB44_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, %v2 ret i64 %ret @@ -958,6 +1267,26 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; 
CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB45_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB45_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = urem i64 %v, %v2 @@ -989,6 +1318,11 @@ define i64 @load_fold_shl2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = shl i64 %v, %v2 ret i64 %ret @@ -1008,6 +1342,12 @@ define i64 @load_fold_shl3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shlxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shlxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = shl i64 %v, %v2 @@ -1039,6 +1379,11 @@ define i64 @load_fold_lshr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = lshr i64 %v, %v2 ret i64 %ret @@ -1058,6 +1403,12 @@ define i64 @load_fold_lshr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shrxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shrxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = lshr i64 %v, %v2 @@ -1089,6 +1440,11 @@ define i64 @load_fold_ashr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = ashr i64 %v, %v2 ret i64 %ret @@ -1108,6 +1464,12 @@ define i64 @load_fold_ashr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: sarxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: sarxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = ashr i64 %v, %v2 @@ -1127,6 +1489,12 @@ define i64 @load_fold_and1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: andl 
$15, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl $15, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = and i64 %v, 15 ret i64 %ret @@ -1155,6 +1523,12 @@ define i64 @load_fold_and3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: andq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: andq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = and i64 %v, %v2 @@ -1196,6 +1570,12 @@ define i64 @load_fold_or3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: orq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_or3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: orq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = or i64 %v, %v2 @@ -1237,6 +1617,12 @@ define i64 @load_fold_xor3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: xorq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_xor3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: xorq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = xor i64 %v, %v2 @@ -1256,6 +1642,12 @@ define i1 @load_fold_icmp1(ptr %p) { ; CHECK-O3-NEXT: cmpq $15, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq $15, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, 15 ret i1 %ret @@ -1274,6 +1666,12 @@ define i1 @load_fold_icmp2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: cmpq %rsi, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, %v2 ret i1 %ret @@ -1294,6 +1692,13 @@ define i1 @load_fold_icmp3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: cmpq %rax, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: cmpq %rax, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = icmp eq i64 %v, %v2 @@ -1319,6 +1724,11 @@ define void @rmw_fold_add1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1338,6 +1748,11 @@ define void @rmw_fold_add2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add2: +; CHECK-MFENCE: # 
%bb.0: +; CHECK-MFENCE-NEXT: addq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1357,6 +1772,11 @@ define void @rmw_fold_sub1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $-15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $-15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1376,6 +1796,11 @@ define void @rmw_fold_sub2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: subq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1411,6 +1836,12 @@ define void @rmw_fold_mul2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: imulq (%rdi), %rsi ; CHECK-O3-NEXT: movq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_mul2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: imulq (%rdi), %rsi +; CHECK-MFENCE-NEXT: movq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = mul i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1447,6 +1878,20 @@ define void @rmw_fold_sdiv1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: addq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1482,6 +1927,26 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB74_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB74_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1534,6 +1999,26 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: 
movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB76_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB76_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = udiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1577,6 +2062,23 @@ define void @rmw_fold_srem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1612,6 +2114,26 @@ define void @rmw_fold_srem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB78_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB78_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1644,6 +2166,18 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1680,6 +2214,26 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def 
$edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB80_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB80_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1717,6 +2271,12 @@ define void @rmw_fold_shl2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = shl i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1754,6 +2314,12 @@ define void @rmw_fold_lshr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = lshr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1791,6 +2357,12 @@ define void @rmw_fold_ashr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = ashr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1812,6 +2384,11 @@ define void @rmw_fold_and1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1831,6 +2408,11 @@ define void @rmw_fold_and2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1850,6 +2432,11 @@ define void @rmw_fold_or1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 
@@ -1869,6 +2456,11 @@ define void @rmw_fold_or2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1888,6 +2480,11 @@ define void @rmw_fold_xor1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1907,6 +2504,11 @@ define void @rmw_fold_xor2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1943,6 +2545,13 @@ define i32 @fold_trunc_add(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: addl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_add: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = add i32 %trunc, %v2 @@ -1964,6 +2573,13 @@ define i32 @fold_trunc_and(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: andl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = and i32 %trunc, %v2 @@ -1985,6 +2601,13 @@ define i32 @fold_trunc_or(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: orl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_or: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: orl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = or i32 %trunc, %v2 @@ -2012,6 +2635,15 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: split_load: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: orl %eax, %ecx +; CHECK-MFENCE-NEXT: movzbl %cl, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %b1 = trunc i64 %v to i8 %v.shift = lshr i64 %v, 32 @@ -2093,12 +2725,26 @@ define void @dead_store(ptr %p, i64 %v) { ;; isn't violated. 
define i64 @nofold_fence(ptr %p) { -; CHECK-LABEL: nofold_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq $15, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: nofold_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq $15, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: nofold_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq $15, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: nofold_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq $15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 fence seq_cst %ret = add i64 %v, 15 @@ -2148,6 +2794,12 @@ define i64 @fold_constant(i64 %arg) { ; CHECK-O3-NEXT: movq %rdi, %rax ; CHECK-O3-NEXT: addq Constant(%rip), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: addq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 %ret = add i64 %v, %arg ret i64 %ret @@ -2167,12 +2819,26 @@ define i64 @fold_constant_clobber(ptr %p, i64 %arg) { } define i64 @fold_constant_fence(i64 %arg) { -; CHECK-LABEL: fold_constant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq Constant(%rip), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rdi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_constant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq Constant(%rip), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rdi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_constant_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq Constant(%rip), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq %rdi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rdi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 fence seq_cst %ret = add i64 %v, %arg @@ -2194,12 +2860,26 @@ define i64 @fold_invariant_clobber(ptr dereferenceable(8) %p, i64 %arg) { define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) { -; CHECK-LABEL: fold_invariant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_invariant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rsi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_invariant_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq %rsi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_invariant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rsi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{} fence seq_cst %ret = add i64 %v, %arg @@ -2222,6 +2902,12 @@ define i16 @load_i8_anyext_i16(ptr %ptr) { ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; 
CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 2 %vec = insertelement <2 x i8> undef, i8 %v, i32 0 %res = bitcast <2 x i8> %vec to i16 @@ -2239,6 +2925,11 @@ define i32 @load_i8_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 4 %vec = insertelement <4 x i8> undef, i8 %v, i32 0 %res = bitcast <4 x i8> %vec to i32 @@ -2257,6 +2948,11 @@ define i32 @load_i16_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 4 %vec = insertelement <2 x i16> undef, i16 %v, i64 0 %res = bitcast <2 x i16> %vec to i32 @@ -2279,6 +2975,13 @@ define i64 @load_i16_anyext_i64(ptr %ptr) { ; CHECK-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-O3-NEXT: vmovq %xmm0, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i64: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: vmovd %eax, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %xmm0, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 8 %vec = insertelement <4 x i16> undef, i16 %v, i64 0 %res = bitcast <4 x i16> %vec to i64 @@ -2307,6 +3010,15 @@ define i16 @load_combine(ptr %p) { ; CHECK-O3-NEXT: orl %ecx, %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_combine: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %ecx +; CHECK-MFENCE-NEXT: movzbl 1(%rdi), %eax +; CHECK-MFENCE-NEXT: shll $8, %eax +; CHECK-MFENCE-NEXT: orl %ecx, %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v1 = load atomic i8, ptr %p unordered, align 2 %p2 = getelementptr i8, ptr %p, i64 1 %v2 = load atomic i8, ptr %p2 unordered, align 1 @@ -2321,7 +3033,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O0-LABEL: fold_cmp_over_fence: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movl (%rdi), %eax -; CHECK-O0-NEXT: mfence +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: cmpl %eax, %esi ; CHECK-O0-NEXT: jne .LBB116_2 ; CHECK-O0-NEXT: # %bb.1: # %taken @@ -2335,7 +3047,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-LABEL: fold_cmp_over_fence: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movl (%rdi), %eax -; CHECK-O3-NEXT: mfence +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: cmpl %eax, %esi ; CHECK-O3-NEXT: jne .LBB116_2 ; CHECK-O3-NEXT: # %bb.1: # %taken @@ -2344,6 +3056,19 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-NEXT: .LBB116_2: # %untaken ; CHECK-O3-NEXT: xorl %eax, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_cmp_over_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl (%rdi), %eax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: cmpl %eax, %esi +; CHECK-MFENCE-NEXT: jne .LBB116_2 +; CHECK-MFENCE-NEXT: # %bb.1: # %taken +; CHECK-MFENCE-NEXT: movb $1, %al +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB116_2: # %untaken +; 
CHECK-MFENCE-NEXT:    xorl %eax, %eax
+; CHECK-MFENCE-NEXT:    retq
   %v2 = load atomic i32, ptr %p unordered, align 4
   fence seq_cst
   %cmp = icmp eq i32 %v1, %v2
diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll
index f34657b3f240c..a7b4790bf801e 100644
--- a/llvm/test/CodeGen/X86/mfence.ll
+++ b/llvm/test/CodeGen/X86/mfence.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+avoid-mfence | FileCheck %s --check-prefix=X64-NO-MFENCE
 
 ; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence.
 
@@ -14,6 +15,11 @@ define void @test() {
 ; X64:       # %bb.0:
 ; X64-NEXT:    mfence
 ; X64-NEXT:    retq
+;
+; X64-NO-MFENCE-LABEL: test:
+; X64-NO-MFENCE:       # %bb.0:
+; X64-NO-MFENCE-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NO-MFENCE-NEXT:    retq
   fence seq_cst
   ret void
 }
@@ -31,7 +37,33 @@ define i32 @fence(ptr %ptr) {
 ; X64-NEXT:    mfence
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
+;
+; X64-NO-MFENCE-LABEL: fence:
+; X64-NO-MFENCE:       # %bb.0:
+; X64-NO-MFENCE-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NO-MFENCE-NEXT:    movl (%rdi), %eax
+; X64-NO-MFENCE-NEXT:    retq
   %atomic = atomicrmw add ptr %ptr, i32 0 seq_cst
   ret i32 %atomic
 }
 
+define void @mfence() nounwind {
+; X32-LABEL: mfence:
+; X32:       # %bb.0:
+; X32-NEXT:    mfence
+; X32-NEXT:    retl
+;
+; X64-LABEL: mfence:
+; X64:       # %bb.0:
+; X64-NEXT:    mfence
+; X64-NEXT:    retq
+;
+; X64-NO-MFENCE-LABEL: mfence:
+; X64-NO-MFENCE:       # %bb.0:
+; X64-NO-MFENCE-NEXT:    mfence
+; X64-NO-MFENCE-NEXT:    retq
+  call void @llvm.x86.sse2.mfence()
+  ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+

From 92cc29938f1339d2c796f1f0cb4c3a8958d5c3c6 Mon Sep 17 00:00:00 2001
From: Zentrik
Date: Mon, 18 Nov 2024 22:14:38 +0000
Subject: [PATCH 4/4] Fix 7f17639803fdc73f9ab9bd60a315596ea8881af9

This should hopefully fix issues with
7f17639803fdc73f9ab9bd60a315596ea8881af9; at the very least, the Julia
tests pass.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 007ffedab7df4..65a1ba419fe91 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13895,7 +13895,7 @@ Value *BoUpSLP::vectorizeTree(
           if (const TreeEntry *ETE = getTreeEntry(V))
             V = ETE->VectorizedValue;
           if (auto *IV = dyn_cast<Instruction>(V);
-              !IV || IV == Vec || IV->comesBefore(cast<Instruction>(Vec)))
+              !IV || IV == Vec || (IV->getParent() == cast<Instruction>(Vec)->getParent() && IV->comesBefore(cast<Instruction>(Vec))))
             Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
           else
             Ex = Builder.CreateExtractElement(Vec, Lane);
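
The guard added in the final hunk matters because Instruction::comesBefore is only defined for two instructions in the same basic block (it asserts otherwise), so the getParent() comparison has to be evaluated before the ordering query. The following is a minimal standalone sketch of that check, not code from the patch: the helper name canReuseExtractBefore is hypothetical, and it assumes Vec is already known to be an Instruction (as the isa<Instruction>(Vec) test from the first patch guarantees at this point).

// Illustrative sketch only; canReuseExtractBefore is not an LLVM API.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Returns true if the old extract's vector operand V is available at the
// point where Vec, the newly created vectorized value, is defined.
static bool canReuseExtractBefore(Value *V, Instruction *Vec) {
  auto *IV = dyn_cast<Instruction>(V);
  if (!IV || IV == Vec)
    return true; // Arguments/constants are always available, as is Vec itself.
  // comesBefore() requires both instructions to share a parent block, so the
  // getParent() comparison must come first; across blocks the vectorizer
  // conservatively refuses to reuse the existing extractelement.
  return IV->getParent() == Vec->getParent() && IV->comesBefore(Vec);
}

When this condition does not hold, the code path in the patch falls back to Builder.CreateExtractElement(Vec, Lane), i.e. it extracts the lane from the new vector value instead of reusing the original extractelement instruction.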