Skip to content

Commit af93c5a

Browse files
vchuravy authored and giordano committed
[X86] Prefer lock or over mfence (llvm#106555)
Originally opened as https://reviews.llvm.org/D129947 LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch fence seq_cst. Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/ After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction. (cherry picked from commit 4d502dd) (cherry picked from commit 707ca0e)
1 parent 94c2702 commit af93c5a

File tree

5 files changed

+847
-107
lines changed

5 files changed

+847
-107
lines changed

Diff for: llvm/lib/Target/X86/X86.td

+36-16
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts
748748
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
749749
"Target has branch hint feature">;
750750

751+
def TuningAvoidMFENCE
752+
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
753+
"Avoid MFENCE for fence seq_cst, and instead use lock or">;
754+
751755
//===----------------------------------------------------------------------===//
752756
// X86 CPU Families
753757
// TODO: Remove these - use general tuning features to determine codegen.
@@ -809,7 +813,8 @@ def ProcessorFeatures {
809813
TuningSlow3OpsLEA,
810814
TuningSlowDivide64,
811815
TuningSlowIncDec,
812-
TuningInsertVZEROUPPER
816+
TuningInsertVZEROUPPER,
817+
TuningAvoidMFENCE
813818
];
814819

815820
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -825,7 +830,8 @@ def ProcessorFeatures {
825830
TuningFastSHLDRotate,
826831
TuningFast15ByteNOP,
827832
TuningPOPCNTFalseDeps,
828-
TuningInsertVZEROUPPER
833+
TuningInsertVZEROUPPER,
834+
TuningAvoidMFENCE
829835
];
830836

831837
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -844,7 +850,8 @@ def ProcessorFeatures {
844850
TuningPOPCNTFalseDeps,
845851
TuningLZCNTFalseDeps,
846852
TuningInsertVZEROUPPER,
847-
TuningAllowLight256Bit
853+
TuningAllowLight256Bit,
854+
TuningAvoidMFENCE
848855
];
849856

850857
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -868,15 +875,17 @@ def ProcessorFeatures {
868875
TuningFastGather,
869876
TuningPOPCNTFalseDeps,
870877
TuningInsertVZEROUPPER,
871-
TuningAllowLight256Bit
878+
TuningAllowLight256Bit,
879+
TuningAvoidMFENCE
872880
];
873881

874882
// Nehalem
875883
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
876884
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
877885
TuningSlowDivide64,
878886
TuningInsertVZEROUPPER,
879-
TuningNoDomainDelayMov];
887+
TuningNoDomainDelayMov,
888+
TuningAvoidMFENCE];
880889

881890
// Westmere
882891
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -897,7 +906,8 @@ def ProcessorFeatures {
897906
TuningFast15ByteNOP,
898907
TuningPOPCNTFalseDeps,
899908
TuningInsertVZEROUPPER,
900-
TuningNoDomainDelayMov];
909+
TuningNoDomainDelayMov,
910+
TuningAvoidMFENCE];
901911
list<SubtargetFeature> SNBFeatures =
902912
!listconcat(WSMFeatures, SNBAdditionalFeatures);
903913

@@ -963,7 +973,8 @@ def ProcessorFeatures {
963973
TuningAllowLight256Bit,
964974
TuningNoDomainDelayMov,
965975
TuningNoDomainDelayShuffle,
966-
TuningNoDomainDelayBlend];
976+
TuningNoDomainDelayBlend,
977+
TuningAvoidMFENCE];
967978
list<SubtargetFeature> SKLFeatures =
968979
!listconcat(BDWFeatures, SKLAdditionalFeatures);
969980

@@ -998,7 +1009,8 @@ def ProcessorFeatures {
9981009
TuningNoDomainDelayMov,
9991010
TuningNoDomainDelayShuffle,
10001011
TuningNoDomainDelayBlend,
1001-
TuningFastImmVectorShift];
1012+
TuningFastImmVectorShift,
1013+
TuningAvoidMFENCE];
10021014
list<SubtargetFeature> SKXFeatures =
10031015
!listconcat(BDWFeatures, SKXAdditionalFeatures);
10041016

@@ -1041,7 +1053,8 @@ def ProcessorFeatures {
10411053
TuningNoDomainDelayMov,
10421054
TuningNoDomainDelayShuffle,
10431055
TuningNoDomainDelayBlend,
1044-
TuningFastImmVectorShift];
1056+
TuningFastImmVectorShift,
1057+
TuningAvoidMFENCE];
10451058
list<SubtargetFeature> CNLFeatures =
10461059
!listconcat(SKLFeatures, CNLAdditionalFeatures);
10471060

@@ -1070,7 +1083,8 @@ def ProcessorFeatures {
10701083
TuningNoDomainDelayMov,
10711084
TuningNoDomainDelayShuffle,
10721085
TuningNoDomainDelayBlend,
1073-
TuningFastImmVectorShift];
1086+
TuningFastImmVectorShift,
1087+
TuningAvoidMFENCE];
10741088
list<SubtargetFeature> ICLFeatures =
10751089
!listconcat(CNLFeatures, ICLAdditionalFeatures);
10761090

@@ -1216,7 +1230,8 @@ def ProcessorFeatures {
12161230
// Tremont
12171231
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
12181232
FeatureGFNI];
1219-
list<SubtargetFeature> TRMTuning = GLPTuning;
1233+
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
1234+
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
12201235
list<SubtargetFeature> TRMFeatures =
12211236
!listconcat(GLPFeatures, TRMAdditionalFeatures);
12221237

@@ -1394,7 +1409,8 @@ def ProcessorFeatures {
13941409
TuningFastImm16,
13951410
TuningSBBDepBreaking,
13961411
TuningSlowDivide64,
1397-
TuningSlowSHLD];
1412+
TuningSlowSHLD,
1413+
TuningAvoidMFENCE];
13981414
list<SubtargetFeature> BtVer2Features =
13991415
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
14001416

@@ -1423,7 +1439,8 @@ def ProcessorFeatures {
14231439
TuningFastScalarShiftMasks,
14241440
TuningBranchFusion,
14251441
TuningSBBDepBreaking,
1426-
TuningInsertVZEROUPPER];
1442+
TuningInsertVZEROUPPER,
1443+
TuningAvoidMFENCE];
14271444

14281445
// PileDriver
14291446
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1503,7 +1520,8 @@ def ProcessorFeatures {
15031520
TuningSlowSHLD,
15041521
TuningSBBDepBreaking,
15051522
TuningInsertVZEROUPPER,
1506-
TuningAllowLight256Bit];
1523+
TuningAllowLight256Bit,
1524+
TuningAvoidMFENCE];
15071525
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
15081526
FeatureRDPID,
15091527
FeatureRDPRU,
@@ -1691,7 +1709,8 @@ def : ProcModel<P, SandyBridgeModel, [
16911709
[
16921710
TuningMacroFusion,
16931711
TuningSlowUAMem16,
1694-
TuningInsertVZEROUPPER
1712+
TuningInsertVZEROUPPER,
1713+
TuningAvoidMFENCE
16951714
]>;
16961715
}
16971716
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1710,7 +1729,8 @@ def : ProcModel<P, SandyBridgeModel, [
17101729
[
17111730
TuningMacroFusion,
17121731
TuningSlowUAMem16,
1713-
TuningInsertVZEROUPPER
1732+
TuningInsertVZEROUPPER,
1733+
TuningAvoidMFENCE
17141734
]>;
17151735
}
17161736

Diff for: llvm/lib/Target/X86/X86ISelLowering.cpp

+4-15
Original file line numberDiff line numberDiff line change
@@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
3095130951
// otherwise, we might be able to be more aggressive on relaxed idempotent
3095230952
// rmw. In practice, they do not look useful, so we don't try to be
3095330953
// especially clever.
30954-
if (SSID == SyncScope::SingleThread)
30955-
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30956-
// the IR level, so we must wrap it in an intrinsic.
30957-
return nullptr;
30958-
30959-
if (!Subtarget.hasMFence())
30960-
// FIXME: it might make sense to use a locked operation here but on a
30961-
// different cache-line to prevent cache-line bouncing. In practice it
30962-
// is probably a small win, and x86 processors without mfence are rare
30963-
// enough that we do not bother.
30964-
return nullptr;
3096530954

30966-
Function *MFence =
30967-
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30968-
Builder.CreateCall(MFence, {});
30955+
// Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
30956+
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
30957+
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
3096930958

3097030959
// Finally we can emit the atomic load.
3097130960
LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
3105331042
// cross-thread fence.
3105431043
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3105531044
FenceSSID == SyncScope::System) {
31056-
if (Subtarget.hasMFence())
31045+
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
3105731046
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
3105831047

3105931048
SDValue Chain = Op.getOperand(0);

Diff for: llvm/test/CodeGen/X86/atomic-idempotent.ll

+30-56
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
2727
;
2828
; X86-SLM-LABEL: add8:
2929
; X86-SLM: # %bb.0:
30-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
31-
; X86-SLM-NEXT: xorl %eax, %eax
32-
; X86-SLM-NEXT: lock xaddb %al, (%ecx)
33-
; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
30+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
31+
; X86-SLM-NEXT: lock orl $0, (%esp)
32+
; X86-SLM-NEXT: movzbl (%eax), %eax
3433
; X86-SLM-NEXT: retl
3534
;
3635
; X86-ATOM-LABEL: add8:
3736
; X86-ATOM: # %bb.0:
38-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
39-
; X86-ATOM-NEXT: xorl %eax, %eax
40-
; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
41-
; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
37+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
38+
; X86-ATOM-NEXT: lock orl $0, (%esp)
39+
; X86-ATOM-NEXT: movzbl (%eax), %eax
4240
; X86-ATOM-NEXT: nop
4341
; X86-ATOM-NEXT: nop
4442
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
6260
;
6361
; X86-SLM-LABEL: or16:
6462
; X86-SLM: # %bb.0:
65-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
66-
; X86-SLM-NEXT: movzwl (%ecx), %eax
67-
; X86-SLM-NEXT: .p2align 4, 0x90
68-
; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
69-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
70-
; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
71-
; X86-SLM-NEXT: jne .LBB1_1
72-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
63+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
64+
; X86-SLM-NEXT: lock orl $0, (%esp)
65+
; X86-SLM-NEXT: movzwl (%eax), %eax
7366
; X86-SLM-NEXT: retl
7467
;
7568
; X86-ATOM-LABEL: or16:
7669
; X86-ATOM: # %bb.0:
77-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
78-
; X86-ATOM-NEXT: movzwl (%ecx), %eax
79-
; X86-ATOM-NEXT: .p2align 4, 0x90
80-
; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
81-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
82-
; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
83-
; X86-ATOM-NEXT: jne .LBB1_1
84-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
70+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
71+
; X86-ATOM-NEXT: lock orl $0, (%esp)
72+
; X86-ATOM-NEXT: movzwl (%eax), %eax
73+
; X86-ATOM-NEXT: nop
74+
; X86-ATOM-NEXT: nop
8575
; X86-ATOM-NEXT: retl
8676
%1 = atomicrmw or ptr %p, i16 0 acquire
8777
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
10393
;
10494
; X86-SLM-LABEL: xor32:
10595
; X86-SLM: # %bb.0:
106-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
107-
; X86-SLM-NEXT: movl (%ecx), %eax
108-
; X86-SLM-NEXT: .p2align 4, 0x90
109-
; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
110-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
111-
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
112-
; X86-SLM-NEXT: jne .LBB2_1
113-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
96+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
97+
; X86-SLM-NEXT: lock orl $0, (%esp)
98+
; X86-SLM-NEXT: movl (%eax), %eax
11499
; X86-SLM-NEXT: retl
115100
;
116101
; X86-ATOM-LABEL: xor32:
117102
; X86-ATOM: # %bb.0:
118-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
119-
; X86-ATOM-NEXT: movl (%ecx), %eax
120-
; X86-ATOM-NEXT: .p2align 4, 0x90
121-
; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
122-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
123-
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
124-
; X86-ATOM-NEXT: jne .LBB2_1
125-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
103+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
104+
; X86-ATOM-NEXT: lock orl $0, (%esp)
105+
; X86-ATOM-NEXT: movl (%eax), %eax
106+
; X86-ATOM-NEXT: nop
107+
; X86-ATOM-NEXT: nop
126108
; X86-ATOM-NEXT: retl
127109
%1 = atomicrmw xor ptr %p, i32 0 release
128110
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
318300
;
319301
; X86-SLM-LABEL: and32:
320302
; X86-SLM: # %bb.0:
321-
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
322-
; X86-SLM-NEXT: movl (%ecx), %eax
323-
; X86-SLM-NEXT: .p2align 4, 0x90
324-
; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
325-
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
326-
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
327-
; X86-SLM-NEXT: jne .LBB5_1
328-
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
303+
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
304+
; X86-SLM-NEXT: lock orl $0, (%esp)
305+
; X86-SLM-NEXT: movl (%eax), %eax
329306
; X86-SLM-NEXT: retl
330307
;
331308
; X86-ATOM-LABEL: and32:
332309
; X86-ATOM: # %bb.0:
333-
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
334-
; X86-ATOM-NEXT: movl (%ecx), %eax
335-
; X86-ATOM-NEXT: .p2align 4, 0x90
336-
; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
337-
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
338-
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
339-
; X86-ATOM-NEXT: jne .LBB5_1
340-
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
310+
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
311+
; X86-ATOM-NEXT: lock orl $0, (%esp)
312+
; X86-ATOM-NEXT: movl (%eax), %eax
313+
; X86-ATOM-NEXT: nop
314+
; X86-ATOM-NEXT: nop
341315
; X86-ATOM-NEXT: retl
342316
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
343317
ret i32 %1

0 commit comments

Comments
 (0)