
Commit 175850f

[AArch64][SVE2] Combine trunc+add+lsr to rshrnb
The example sequence

  add z0.h, z0.h, #32
  lsr z0.h, z0.h, #6
  st1b z0.h, x1

can be replaced with

  rshrnb z0.b, z0.h, #6
  st1b z0.h, x1

since the truncating store only reads the bottom half of each destination element. In similar fashion,

  add z0.s, z0.s, #32
  lsr z0.s, z0.s, #6
  add z1.s, z1.s, #32
  lsr z1.s, z1.s, #6
  uzp1 z0.h, z0.h, z1.h

can be replaced with

  rshrnb z0.h, z0.s, #6
  rshrnb z1.h, z1.s, #6
  uzp1 z0.h, z0.h, z1.h

Differential Revision: https://reviews.llvm.org/D155299
1 parent cc488b8 commit 175850f
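A quick way to see why the combine insists the added constant be the splat 1 << (ShiftValue - 1): adding that bias before a logical right shift by ShiftValue is exactly division by 2^ShiftValue rounded to nearest (ties rounded up), which is the rounding RSHRNB applies before writing the narrowed result into the bottom half of each lane. A minimal scalar sketch of that identity, written for illustration (it is not code from this patch):

  // Exhaustively check, for every 16-bit input, that the add+lsr pattern
  // equals round-to-nearest division by 1 << ShiftValue (ties rounded up).
  #include <cassert>
  #include <cstdint>

  int main() {
    const unsigned ShiftValue = 6;                 // as in the examples above
    const uint32_t Bias = 1u << (ShiftValue - 1);  // 32, the splat the combine matches
    for (uint32_t X = 0; X <= UINT16_MAX; ++X) {
      uint32_t AddLsr = (X + Bias) >> ShiftValue;  // the matched add+srl pattern
      uint32_t Nearest =
          X / (1u << ShiftValue) + (X % (1u << ShiftValue) >= Bias ? 1 : 0);
      assert(AddLsr == Nearest);
    }
    return 0;
  }

The sketch does the arithmetic at 32 bits; the vector pattern additionally requires the add to have a single use and the consumer (a uzp1 or a truncating store) to ignore the top half of each lane, since rshrnb zeroes those bits.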

File tree

5 files changed: +319 -3 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+72 -2
@@ -2580,6 +2580,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CALL_BTI)
     MAKE_CASE(AArch64ISD::MRRS)
     MAKE_CASE(AArch64ISD::MSRR)
+    MAKE_CASE(AArch64ISD::RSHRNB_I)
   }
 #undef MAKE_CASE
   return nullptr;
@@ -20078,7 +20079,59 @@ static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
+// Try to simplify:
+//    t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
+//    t2 = nxv8i16 srl(t1, ShiftValue)
+// to
+//    t1 = nxv8i16 rshrnb(X, shiftvalue).
+// rshrnb will zero the top half bits of each element. Therefore, this combine
+// should only be performed when a following instruction with the rshrnb
+// as an operand does not care about the top half of each element. For example,
+// a uzp1 or a truncating store.
+static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
+                                         const AArch64Subtarget *Subtarget) {
+  EVT VT = Srl->getValueType(0);
+
+  if (!VT.isScalableVector() || !Subtarget->hasSVE2() ||
+      Srl->getOpcode() != ISD::SRL)
+    return SDValue();
+
+  EVT ResVT;
+  if (VT == MVT::nxv8i16)
+    ResVT = MVT::nxv16i8;
+  else if (VT == MVT::nxv4i32)
+    ResVT = MVT::nxv8i16;
+  else if (VT == MVT::nxv2i64)
+    ResVT = MVT::nxv4i32;
+  else
+    return SDValue();
+
+  auto SrlOp1 =
+      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Srl->getOperand(1)));
+  if (!SrlOp1)
+    return SDValue();
+  unsigned ShiftValue = SrlOp1->getZExtValue();
+
+  SDValue Add = Srl->getOperand(0);
+  if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
+    return SDValue();
+  auto AddOp1 =
+      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
+  if (!AddOp1)
+    return SDValue();
+  uint64_t AddValue = AddOp1->getZExtValue();
+  if (AddValue != 1ULL << (ShiftValue - 1))
+    return SDValue();
+
+  SDLoc DL(Srl);
+  SDValue Rshrnb = DAG.getNode(
+      AArch64ISD::RSHRNB_I, DL, ResVT,
+      {Add->getOperand(0), DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
+  return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
+}
+
+static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
+                                 const AArch64Subtarget *Subtarget) {
   SDLoc DL(N);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
@@ -20111,6 +20164,12 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
     }
   }
 
+  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
+    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
+
+  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
+    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
+
   // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
   if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
     if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
@@ -20727,6 +20786,17 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
     return Store;
 
+  if (ST->isTruncatingStore())
+    if (SDValue Rshrnb =
+            trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
+      EVT StoreVT = ST->getMemoryVT();
+      if ((ValueVT == MVT::nxv8i16 && StoreVT == MVT::nxv8i8) ||
+          (ValueVT == MVT::nxv4i32 && StoreVT == MVT::nxv4i16) ||
+          (ValueVT == MVT::nxv2i64 && StoreVT == MVT::nxv2i32))
+        return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
+                                 StoreVT, ST->getMemOperand());
+    }
+
   return SDValue();
 }
 
@@ -23044,7 +23114,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case AArch64ISD::UUNPKHI:
     return performUnpackCombine(N, DAG, Subtarget);
   case AArch64ISD::UZP1:
-    return performUzpCombine(N, DAG);
+    return performUzpCombine(N, DAG, Subtarget);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DCI);
   case AArch64ISD::REINTERPRET_CAST:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

+3
@@ -215,6 +215,9 @@ enum NodeType : unsigned {
   SRSHR_I,
   URSHR_I,
 
+  // Vector narrowing shift by immediate (bottom)
+  RSHRNB_I,
+
   // Vector shift by constant and insert
   VSLI,
   VSRI,

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+6
@@ -822,6 +822,12 @@ def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
                         [SDNPHasChain, SDNPOutGlue]>;
 
+def SD_AArch64rshrnb : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<2>]>;
+def AArch64rshrnb : SDNode<"AArch64ISD::RSHRNB_I", SD_AArch64rshrnb>;
+def AArch64rshrnb_pf : PatFrags<(ops node:$rs, node:$i),
+                                [(AArch64rshrnb node:$rs, node:$i),
+                                 (int_aarch64_sve_rshrnb node:$rs, node:$i)]>;
+
 // Match add node and also treat an 'or' node is as an 'add' if the or'ed operands
 // have no common bits.
 def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

+1 -1
@@ -3524,7 +3524,7 @@ let Predicates = [HasSVE2orSME] in {
   defm SQSHRUNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb",  int_aarch64_sve_sqshrunb>;
   defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
   defm SHRNB_ZZI     : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb",     int_aarch64_sve_shrnb>;
-  defm RSHRNB_ZZI    : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb",    int_aarch64_sve_rshrnb>;
+  defm RSHRNB_ZZI    : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb",    AArch64rshrnb_pf>;
   defm SQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb",   int_aarch64_sve_sqshrnb>;
   defm SQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb",  int_aarch64_sve_sqrshrnb>;
   defm UQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb",   int_aarch64_sve_uqshrnb>;
New test file

+237

@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+define void @add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: add_lshr_rshrnb_b_6:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rshrnb z0.b, z0.h, #6
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i8> %3, ptr %4, align 1
+  ret void
+}
+
+define void @neg_add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: neg_add_lshr_rshrnb_b_6:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    add z0.h, z0.h, #1 // =0x1
+; CHECK-NEXT:    lsr z0.h, z0.h, #6
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i8> %3, ptr %4, align 1
+  ret void
+}
+
+define void @add_lshr_rshrnb_h_7(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: add_lshr_rshrnb_h_7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    rshrnb z0.b, z0.h, #7
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 64, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 7, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i8> %3, ptr %4, align 1
+  ret void
+}
+
+define void @add_lshr_rshrn_h_6(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: add_lshr_rshrn_h_6:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rshrnb z0.h, z0.s, #6
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x2, lsl #1]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 4 x i32>, ptr %ptr, align 2
+  %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 32, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 6, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
+  %4 = getelementptr inbounds i16, ptr %dst, i64 %index
+  store <vscale x 4 x i16> %3, ptr %4, align 1
+  ret void
+}
+
+define void @add_lshr_rshrnb_h_2(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: add_lshr_rshrnb_h_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    rshrnb z0.h, z0.s, #2
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x2, lsl #1]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 4 x i32>, ptr %ptr, align 2
+  %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
+  %4 = getelementptr inbounds i16, ptr %dst, i64 %index
+  store <vscale x 4 x i16> %3, ptr %4, align 1
+  ret void
+}
+
+define void @neg_add_lshr_rshrnb_h_0(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: neg_add_lshr_rshrnb_h_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %load = load <vscale x 4 x i32>, ptr %ptr, align 2
+  %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+  %3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
+  %4 = getelementptr inbounds i16, ptr %dst, i64 %index
+  store <vscale x 4 x i16> %3, ptr %4, align 1
+  ret void
+}
+
+define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, <vscale x 16 x i16> %arg1){
+; CHECK-LABEL: wide_add_shift_add_rshrnb_b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rshrnb z1.b, z1.h, #6
+; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    rshrnb z0.b, z0.h, #6
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    add z0.b, z2.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT:    ret
+  %1 = add <vscale x 16 x i16> %arg1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 32, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %2 = lshr <vscale x 16 x i16> %1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 6, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %3 = getelementptr inbounds i8, ptr %dest, i64 %index
+  %load = load <vscale x 16 x i8>, ptr %3, align 2
+  %4 = trunc <vscale x 16 x i16> %2 to <vscale x 16 x i8>
+  %5 = add <vscale x 16 x i8> %load, %4
+  store <vscale x 16 x i8> %5, ptr %3, align 2
+  ret void
+}
+
+define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, <vscale x 8 x i32> %arg1){
+; CHECK-LABEL: wide_add_shift_add_rshrnb_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    rshrnb z1.h, z1.s, #6
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    rshrnb z0.h, z0.s, #6
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    add z0.h, z2.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %1 = add <vscale x 8 x i32> %arg1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %2 = lshr <vscale x 8 x i32> %1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %3 = getelementptr inbounds i16, ptr %dest, i64 %index
+  %load = load <vscale x 8 x i16>, ptr %3, align 2
+  %4 = trunc <vscale x 8 x i32> %2 to <vscale x 8 x i16>
+  %5 = add <vscale x 8 x i16> %load, %4
+  store <vscale x 8 x i16> %5, ptr %3, align 2
+  ret void
+}
+
+define void @neg_trunc_lsr_add_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, <vscale x 8 x i16> %add_op1){
+; CHECK-LABEL: neg_trunc_lsr_add_op1_not_splat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEXT:    lsr z0.h, z0.h, #6
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, %add_op1
+  %2 = lshr <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 6, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  %3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i8> %3, ptr %4, align 1
+  ret void
+}
+
+define void @neg_trunc_lsr_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, <vscale x 8 x i16> %lshr_op1){
+; CHECK-LABEL: neg_trunc_lsr_op1_not_splat:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    add z1.h, z1.h, #32 // =0x20
+; CHECK-NEXT:    lsrr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 32, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  %2 = lshr <vscale x 8 x i16> %1, %lshr_op1
+  %3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %4 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i8> %3, ptr %4, align 1
+  ret void
+}
+
+define void @neg_add_has_two_uses(ptr %ptr, ptr %dst, ptr %dst2, i64 %index){
+; CHECK-LABEL: neg_add_has_two_uses:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    add z0.h, z0.h, #32 // =0x20
+; CHECK-NEXT:    lsr z1.h, z0.h, #6
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2, x3, lsl #1]
+; CHECK-NEXT:    st1b { z1.h }, p0, [x1, x3]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 8 x i16>, ptr %ptr, align 2
+  %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+  %3 = add <vscale x 8 x i16> %1, %1
+  %4 = getelementptr inbounds i16, ptr %dst2, i64 %index
+  %5 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
+  %6 = getelementptr inbounds i8, ptr %dst, i64 %index
+  store <vscale x 8 x i16> %3, ptr %4, align 1
+  store <vscale x 8 x i8> %5, ptr %6, align 1
+  ret void
+}
+
+define void @add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: add_lshr_rshrnb_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    rshrnb z0.s, z0.d, #6
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1, x2, lsl #2]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 2 x i64>, ptr %ptr, align 2
+  %1 = add <vscale x 2 x i64> %load, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 32, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %2 = lshr <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %3 = trunc <vscale x 2 x i64> %2 to <vscale x 2 x i32>
+  %4 = getelementptr inbounds i32, ptr %dst, i64 %index
+  store <vscale x 2 x i32> %3, ptr %4, align 1
+  ret void
+}
+
+define void @neg_add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){
+; CHECK-LABEL: neg_add_lshr_rshrnb_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    add z0.d, z0.d, #32 // =0x20
+; CHECK-NEXT:    lsr z0.d, z0.d, #6
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1, x2, lsl #1]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 2 x i64>, ptr %ptr, align 2
+  %1 = add <vscale x 2 x i64> %load, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 32, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %2 = lshr <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+  %3 = trunc <vscale x 2 x i64> %2 to <vscale x 2 x i16>
+  %4 = getelementptr inbounds i16, ptr %dst, i64 %index
+  store <vscale x 2 x i16> %3, ptr %4, align 1
+  ret void
+}
