
Commit dc8a41d

[ARM] Simplify address calculation for NEON load/store
The patch attempts to optimize a sequence of SIMD loads from the same base pointer:

  %0 = gep float*, float* base, i32 4
  %1 = bitcast float* %0 to <4 x float>*
  %2 = load <4 x float>, <4 x float>* %1
  ...
  %n1 = gep float*, float* base, i32 N
  %n2 = bitcast float* %n1 to <4 x float>*
  %n3 = load <4 x float>, <4 x float>* %n2

For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #16]. However, 32-bit NEON VLD1/VST1 lack the [Wn, #imm] addressing mode, so the address is computed before every load/store instruction:

  add r2, r0, #32
  add r0, r0, #16
  vld1.32 {d18, d19}, [r2]
  vld1.32 {d22, d23}, [r0]

This can be improved by computing the address for the first load and then using a post-indexed form of VLD1/VST1 to load the rest:

  add r0, r0, #16
  vld1.32 {d18, d19}, [r0]!
  vld1.32 {d22, d23}, [r0]

In order to do that, the patch adds more patterns to DAGCombine:

- (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1 and inc2 are constants.
- (or ptr inc) is now recognized as a pointer increment if ptr is sufficiently aligned.

In addition to that, we now search for all possible base updates and then pick the best one.

Differential Revision: https://reviews.llvm.org/D108988
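As an illustration (not part of the commit message), here is a minimal IR sketch of the kind of input the new combine targets; the function name @sum_adjacent and its signature are hypothetical. Because the base pointer is 32-byte aligned, the +16 byte offset of the second load can surface as an (or ptr, 16) node in the DAG, which the patch now treats as a pointer increment:

  ; Hypothetical example: two adjacent <4 x float> loads from an aligned base.
  define <4 x float> @sum_adjacent(<4 x float>* align 32 %base) {
  entry:
    %p0 = bitcast <4 x float>* %base to float*
    %p1 = getelementptr inbounds float, float* %p0, i32 4
    %q1 = bitcast float* %p1 to <4 x float>*
    %v0 = load <4 x float>, <4 x float>* %base, align 32
    %v1 = load <4 x float>, <4 x float>* %q1, align 16
    %sum = fadd <4 x float> %v0, %v1
    ret <4 x float> %sum
  }

With the patch, an ARM/NEON target would be expected to emit the first load in post-indexed form (vld1.32 {...}, [rN]!), leaving the updated pointer ready for the second load, instead of materializing base+16 with a separate add or orr.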
1 parent 8848766 commit dc8a41d

12 files changed: +1113 −614 lines

llvm/lib/Target/ARM/ARMISelLowering.cpp (+445 −209)

Large diffs are not rendered by default.

llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll (+41 −64)
@@ -3,80 +3,57 @@
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
 ; objects that are assumed to be 64-byte aligned.
-@T3_retval = common global <16 x float> zeroinitializer, align 16
 
 define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
-entry:
 ; CHECK-LABEL: test1:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
+; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+entry:
 %retval = alloca <16 x float>, align 64
-%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
-store <16 x float> %0, <16 x float>* %retval
-%1 = load <16 x float>, <16 x float>* %retval
-store <16 x float> %1, <16 x float>* %agg.result, align 16
+%a1 = bitcast <16 x float>* %retval to float*
+%a2 = getelementptr inbounds float, float* %a1, i64 8
+%a3 = bitcast float* %a2 to <4 x float>*
+
+%b1 = bitcast <16 x float>* %agg.result to float*
+%b2 = getelementptr inbounds float, float* %b1, i64 8
+%b3 = bitcast float* %b2 to <4 x float>*
+
+%0 = load <4 x float>, <4 x float>* %a3, align 16
+%1 = load <4 x float>, <4 x float>* %b3, align 16
+store <4 x float> %0, <4 x float>* %b3, align 16
+store <4 x float> %1, <4 x float>* %a3, align 16
 ret void
 }
 
 define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
-entry:
 ; CHECK-LABEL: test2:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R3:[0-9]+]], #32
-; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
-; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[ALIGNED:[0-9]+]], sp
+; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+entry:
+%retval = alloca <16 x float>, align 64
+%a1 = bitcast <16 x float>* %retval to float*
+%a2 = getelementptr inbounds float, float* %a1, i64 8
+%a3 = bitcast float* %a2 to <4 x float>*
 
+%b1 = bitcast <16 x float>* %agg.result to float*
+%b2 = getelementptr inbounds float, float* %b1, i64 8
+%b3 = bitcast float* %b2 to <4 x float>*
 
-%retval = alloca <16 x float>, align 64
-%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
-store <16 x float> %0, <16 x float>* %retval
-%1 = load <16 x float>, <16 x float>* %retval
-store <16 x float> %1, <16 x float>* %agg.result, align 16
+%0 = load <4 x float>, <4 x float>* %a3, align 16
+%1 = load <4 x float>, <4 x float>* %b3, align 16
+store <4 x float> %0, <4 x float>* %b3, align 16
+store <4 x float> %1, <4 x float>* %a3, align 16
 ret void
 }
