Skip to content

Commit 5b95bba

Browse files
committed
[RISCV] Set Fast flag for unaligned memory accesses
The +unaligned-scalar-mem and +unaligned-vector-mem features were added in D126085 and D149375 respectively to allow subtargets to indicate that they support misaligned loads/stores with "sufficient" performance. This is separate from whether or not the target actually supports misaligned accesses, which could be determined from Zicclsm.

This patch enables the Fast flag under the assumption that any subtarget that declares support for +unaligned-*-mem will want to opt into optimisations that take advantage of misaligned scalar accesses, such as store merging.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D150771
1 parent 587b8f3 commit 5b95bba

File tree

5 files changed

+269
-171
lines changed

5 files changed

+269
-171
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17166,7 +17166,7 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
1716617166
unsigned *Fast) const {
1716717167
if (!VT.isVector()) {
1716817168
if (Fast)
17169-
*Fast = 0;
17169+
*Fast = Subtarget.enableUnalignedScalarMem();
1717017170
return Subtarget.enableUnalignedScalarMem();
1717117171
}
1717217172

@@ -17183,7 +17183,7 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
1718317183
// misaligned accesses. TODO: Work through the codegen implications of
1718417184
// allowing such accesses to be formed, and considered fast.
1718517185
if (Fast)
17186-
*Fast = 0;
17186+
*Fast = Subtarget.enableUnalignedVectorMem();
1718717187
return Subtarget.enableUnalignedVectorMem();
1718817188
}
1718917189

llvm/test/CodeGen/RISCV/memcpy-inline.ll

Lines changed: 111 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -21,49 +21,77 @@
2121
@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
2222

2323
define i32 @t0() {
24-
; RV32-BOTH-LABEL: t0:
25-
; RV32-BOTH: # %bb.0: # %entry
26-
; RV32-BOTH-NEXT: lui a0, %hi(src)
27-
; RV32-BOTH-NEXT: lw a1, %lo(src)(a0)
28-
; RV32-BOTH-NEXT: lui a2, %hi(dst)
29-
; RV32-BOTH-NEXT: sw a1, %lo(dst)(a2)
30-
; RV32-BOTH-NEXT: addi a0, a0, %lo(src)
31-
; RV32-BOTH-NEXT: lbu a1, 10(a0)
32-
; RV32-BOTH-NEXT: lh a3, 8(a0)
33-
; RV32-BOTH-NEXT: lw a0, 4(a0)
34-
; RV32-BOTH-NEXT: addi a2, a2, %lo(dst)
35-
; RV32-BOTH-NEXT: sb a1, 10(a2)
36-
; RV32-BOTH-NEXT: sh a3, 8(a2)
37-
; RV32-BOTH-NEXT: sw a0, 4(a2)
38-
; RV32-BOTH-NEXT: li a0, 0
39-
; RV32-BOTH-NEXT: ret
24+
; RV32-LABEL: t0:
25+
; RV32: # %bb.0: # %entry
26+
; RV32-NEXT: lui a0, %hi(src)
27+
; RV32-NEXT: lw a1, %lo(src)(a0)
28+
; RV32-NEXT: lui a2, %hi(dst)
29+
; RV32-NEXT: sw a1, %lo(dst)(a2)
30+
; RV32-NEXT: addi a0, a0, %lo(src)
31+
; RV32-NEXT: lbu a1, 10(a0)
32+
; RV32-NEXT: lh a3, 8(a0)
33+
; RV32-NEXT: lw a0, 4(a0)
34+
; RV32-NEXT: addi a2, a2, %lo(dst)
35+
; RV32-NEXT: sb a1, 10(a2)
36+
; RV32-NEXT: sh a3, 8(a2)
37+
; RV32-NEXT: sw a0, 4(a2)
38+
; RV32-NEXT: li a0, 0
39+
; RV32-NEXT: ret
4040
;
41-
; RV64-BOTH-LABEL: t0:
42-
; RV64-BOTH: # %bb.0: # %entry
43-
; RV64-BOTH-NEXT: lui a0, %hi(src)
44-
; RV64-BOTH-NEXT: ld a1, %lo(src)(a0)
45-
; RV64-BOTH-NEXT: lui a2, %hi(dst)
46-
; RV64-BOTH-NEXT: addi a0, a0, %lo(src)
47-
; RV64-BOTH-NEXT: lbu a3, 10(a0)
48-
; RV64-BOTH-NEXT: lh a0, 8(a0)
49-
; RV64-BOTH-NEXT: sd a1, %lo(dst)(a2)
50-
; RV64-BOTH-NEXT: addi a1, a2, %lo(dst)
51-
; RV64-BOTH-NEXT: sb a3, 10(a1)
52-
; RV64-BOTH-NEXT: sh a0, 8(a1)
53-
; RV64-BOTH-NEXT: li a0, 0
54-
; RV64-BOTH-NEXT: ret
41+
; RV64-LABEL: t0:
42+
; RV64: # %bb.0: # %entry
43+
; RV64-NEXT: lui a0, %hi(src)
44+
; RV64-NEXT: ld a1, %lo(src)(a0)
45+
; RV64-NEXT: lui a2, %hi(dst)
46+
; RV64-NEXT: addi a0, a0, %lo(src)
47+
; RV64-NEXT: lbu a3, 10(a0)
48+
; RV64-NEXT: lh a0, 8(a0)
49+
; RV64-NEXT: sd a1, %lo(dst)(a2)
50+
; RV64-NEXT: addi a1, a2, %lo(dst)
51+
; RV64-NEXT: sb a3, 10(a1)
52+
; RV64-NEXT: sh a0, 8(a1)
53+
; RV64-NEXT: li a0, 0
54+
; RV64-NEXT: ret
55+
;
56+
; RV32-FAST-LABEL: t0:
57+
; RV32-FAST: # %bb.0: # %entry
58+
; RV32-FAST-NEXT: lui a0, %hi(src)
59+
; RV32-FAST-NEXT: lw a1, %lo(src)(a0)
60+
; RV32-FAST-NEXT: lui a2, %hi(dst)
61+
; RV32-FAST-NEXT: addi a0, a0, %lo(src)
62+
; RV32-FAST-NEXT: lw a3, 7(a0)
63+
; RV32-FAST-NEXT: lw a0, 4(a0)
64+
; RV32-FAST-NEXT: sw a1, %lo(dst)(a2)
65+
; RV32-FAST-NEXT: addi a1, a2, %lo(dst)
66+
; RV32-FAST-NEXT: sw a3, 7(a1)
67+
; RV32-FAST-NEXT: sw a0, 4(a1)
68+
; RV32-FAST-NEXT: li a0, 0
69+
; RV32-FAST-NEXT: ret
70+
;
71+
; RV64-FAST-LABEL: t0:
72+
; RV64-FAST: # %bb.0: # %entry
73+
; RV64-FAST-NEXT: lui a0, %hi(src)
74+
; RV64-FAST-NEXT: ld a1, %lo(src)(a0)
75+
; RV64-FAST-NEXT: addi a0, a0, %lo(src)
76+
; RV64-FAST-NEXT: lw a0, 7(a0)
77+
; RV64-FAST-NEXT: lui a2, %hi(dst)
78+
; RV64-FAST-NEXT: sd a1, %lo(dst)(a2)
79+
; RV64-FAST-NEXT: addi a1, a2, %lo(dst)
80+
; RV64-FAST-NEXT: sw a0, 7(a1)
81+
; RV64-FAST-NEXT: li a0, 0
82+
; RV64-FAST-NEXT: ret
5583
entry:
5684
call void @llvm.memcpy.p0.p0.i32(ptr align 8 @dst, ptr align 8 @src, i32 11, i1 false)
5785
ret i32 0
5886
}
5987

6088
define void @t1(ptr nocapture %C) nounwind {
61-
; RV32-BOTH-LABEL: t1:
62-
; RV32-BOTH: # %bb.0: # %entry
63-
; RV32-BOTH-NEXT: lui a1, %hi(.L.str1)
64-
; RV32-BOTH-NEXT: addi a1, a1, %lo(.L.str1)
65-
; RV32-BOTH-NEXT: li a2, 31
66-
; RV32-BOTH-NEXT: tail memcpy@plt
89+
; RV32-LABEL: t1:
90+
; RV32: # %bb.0: # %entry
91+
; RV32-NEXT: lui a1, %hi(.L.str1)
92+
; RV32-NEXT: addi a1, a1, %lo(.L.str1)
93+
; RV32-NEXT: li a2, 31
94+
; RV32-NEXT: tail memcpy@plt
6795
;
6896
; RV64-LABEL: t1:
6997
; RV64: # %bb.0: # %entry
@@ -72,22 +100,45 @@ define void @t1(ptr nocapture %C) nounwind {
72100
; RV64-NEXT: li a2, 31
73101
; RV64-NEXT: tail memcpy@plt
74102
;
103+
; RV32-FAST-LABEL: t1:
104+
; RV32-FAST: # %bb.0: # %entry
105+
; RV32-FAST-NEXT: lui a1, 1141
106+
; RV32-FAST-NEXT: addi a1, a1, -439
107+
; RV32-FAST-NEXT: sw a1, 27(a0)
108+
; RV32-FAST-NEXT: lui a1, 300325
109+
; RV32-FAST-NEXT: addi a1, a1, 1107
110+
; RV32-FAST-NEXT: sw a1, 24(a0)
111+
; RV32-FAST-NEXT: lui a1, 132181
112+
; RV32-FAST-NEXT: addi a1, a1, -689
113+
; RV32-FAST-NEXT: sw a1, 20(a0)
114+
; RV32-FAST-NEXT: lui a1, 340483
115+
; RV32-FAST-NEXT: addi a1, a1, -947
116+
; RV32-FAST-NEXT: sw a1, 16(a0)
117+
; RV32-FAST-NEXT: lui a1, 267556
118+
; RV32-FAST-NEXT: addi a1, a1, 1871
119+
; RV32-FAST-NEXT: sw a1, 12(a0)
120+
; RV32-FAST-NEXT: lui a1, 337154
121+
; RV32-FAST-NEXT: addi a1, a1, 69
122+
; RV32-FAST-NEXT: sw a1, 8(a0)
123+
; RV32-FAST-NEXT: lui a1, 320757
124+
; RV32-FAST-NEXT: addi a1, a1, 1107
125+
; RV32-FAST-NEXT: sw a1, 4(a0)
126+
; RV32-FAST-NEXT: lui a1, 365861
127+
; RV32-FAST-NEXT: addi a1, a1, -1980
128+
; RV32-FAST-NEXT: sw a1, 0(a0)
129+
; RV32-FAST-NEXT: ret
130+
;
75131
; RV64-FAST-LABEL: t1:
76132
; RV64-FAST: # %bb.0: # %entry
77133
; RV64-FAST-NEXT: lui a1, %hi(.L.str1)
78134
; RV64-FAST-NEXT: ld a2, %lo(.L.str1)(a1)
79-
; RV64-FAST-NEXT: sd a2, 0(a0)
80-
; RV64-FAST-NEXT: lui a2, 4
81-
; RV64-FAST-NEXT: addiw a2, a2, 1870
82-
; RV64-FAST-NEXT: sh a2, 28(a0)
83-
; RV64-FAST-NEXT: lui a2, 300325
84-
; RV64-FAST-NEXT: addiw a2, a2, 1107
85135
; RV64-FAST-NEXT: addi a1, a1, %lo(.L.str1)
86-
; RV64-FAST-NEXT: ld a3, 16(a1)
136+
; RV64-FAST-NEXT: ld a3, 23(a1)
137+
; RV64-FAST-NEXT: ld a4, 16(a1)
87138
; RV64-FAST-NEXT: ld a1, 8(a1)
88-
; RV64-FAST-NEXT: sw a2, 24(a0)
89-
; RV64-FAST-NEXT: sb zero, 30(a0)
90-
; RV64-FAST-NEXT: sd a3, 16(a0)
139+
; RV64-FAST-NEXT: sd a2, 0(a0)
140+
; RV64-FAST-NEXT: sd a3, 23(a0)
141+
; RV64-FAST-NEXT: sd a4, 16(a0)
91142
; RV64-FAST-NEXT: sd a1, 8(a0)
92143
; RV64-FAST-NEXT: ret
93144
entry:
@@ -270,21 +321,19 @@ define void @t5(ptr nocapture %C) nounwind {
270321
;
271322
; RV32-FAST-LABEL: t5:
272323
; RV32-FAST: # %bb.0: # %entry
273-
; RV32-FAST-NEXT: sb zero, 6(a0)
274-
; RV32-FAST-NEXT: lui a1, 5
275-
; RV32-FAST-NEXT: addi a1, a1, 1107
276-
; RV32-FAST-NEXT: sh a1, 4(a0)
324+
; RV32-FAST-NEXT: lui a1, 1349
325+
; RV32-FAST-NEXT: addi a1, a1, 857
326+
; RV32-FAST-NEXT: sw a1, 3(a0)
277327
; RV32-FAST-NEXT: lui a1, 365861
278328
; RV32-FAST-NEXT: addi a1, a1, -1980
279329
; RV32-FAST-NEXT: sw a1, 0(a0)
280330
; RV32-FAST-NEXT: ret
281331
;
282332
; RV64-FAST-LABEL: t5:
283333
; RV64-FAST: # %bb.0: # %entry
284-
; RV64-FAST-NEXT: sb zero, 6(a0)
285-
; RV64-FAST-NEXT: lui a1, 5
286-
; RV64-FAST-NEXT: addiw a1, a1, 1107
287-
; RV64-FAST-NEXT: sh a1, 4(a0)
334+
; RV64-FAST-NEXT: lui a1, 1349
335+
; RV64-FAST-NEXT: addiw a1, a1, 857
336+
; RV64-FAST-NEXT: sw a1, 3(a0)
288337
; RV64-FAST-NEXT: lui a1, 365861
289338
; RV64-FAST-NEXT: addiw a1, a1, -1980
290339
; RV64-FAST-NEXT: sw a1, 0(a0)
@@ -342,14 +391,12 @@ define void @t6() nounwind {
342391
; RV64-FAST-LABEL: t6:
343392
; RV64-FAST: # %bb.0: # %entry
344393
; RV64-FAST-NEXT: lui a0, %hi(.L.str6)
345-
; RV64-FAST-NEXT: ld a0, %lo(.L.str6)(a0)
346-
; RV64-FAST-NEXT: lui a1, %hi(spool.splbuf)
347-
; RV64-FAST-NEXT: li a2, 88
348-
; RV64-FAST-NEXT: sh a2, %lo(spool.splbuf+12)(a1)
349-
; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf)(a1)
350-
; RV64-FAST-NEXT: lui a0, 361862
351-
; RV64-FAST-NEXT: addiw a0, a0, -1960
352-
; RV64-FAST-NEXT: sw a0, %lo(spool.splbuf+8)(a1)
394+
; RV64-FAST-NEXT: ld a1, %lo(.L.str6)(a0)
395+
; RV64-FAST-NEXT: addi a0, a0, %lo(.L.str6)
396+
; RV64-FAST-NEXT: ld a0, 6(a0)
397+
; RV64-FAST-NEXT: lui a2, %hi(spool.splbuf)
398+
; RV64-FAST-NEXT: sd a1, %lo(spool.splbuf)(a2)
399+
; RV64-FAST-NEXT: sd a0, %lo(spool.splbuf+6)(a2)
353400
; RV64-FAST-NEXT: ret
354401
entry:
355402
call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false)
@@ -397,3 +444,5 @@ entry:
397444

398445
declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind
399446
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
447+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
448+
; RV64-BOTH: {{.*}}

llvm/test/CodeGen/RISCV/memset-inline.ll

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,40 +1248,66 @@ define void @aligned_bzero_64(ptr %a) nounwind {
12481248
; Usual overlap tricks
12491249

12501250
define void @aligned_bzero_7(ptr %a) nounwind {
1251-
; RV32-BOTH-LABEL: aligned_bzero_7:
1252-
; RV32-BOTH: # %bb.0:
1253-
; RV32-BOTH-NEXT: sb zero, 6(a0)
1254-
; RV32-BOTH-NEXT: sh zero, 4(a0)
1255-
; RV32-BOTH-NEXT: sw zero, 0(a0)
1256-
; RV32-BOTH-NEXT: ret
1251+
; RV32-LABEL: aligned_bzero_7:
1252+
; RV32: # %bb.0:
1253+
; RV32-NEXT: sb zero, 6(a0)
1254+
; RV32-NEXT: sh zero, 4(a0)
1255+
; RV32-NEXT: sw zero, 0(a0)
1256+
; RV32-NEXT: ret
12571257
;
1258-
; RV64-BOTH-LABEL: aligned_bzero_7:
1259-
; RV64-BOTH: # %bb.0:
1260-
; RV64-BOTH-NEXT: sb zero, 6(a0)
1261-
; RV64-BOTH-NEXT: sh zero, 4(a0)
1262-
; RV64-BOTH-NEXT: sw zero, 0(a0)
1263-
; RV64-BOTH-NEXT: ret
1258+
; RV64-LABEL: aligned_bzero_7:
1259+
; RV64: # %bb.0:
1260+
; RV64-NEXT: sb zero, 6(a0)
1261+
; RV64-NEXT: sh zero, 4(a0)
1262+
; RV64-NEXT: sw zero, 0(a0)
1263+
; RV64-NEXT: ret
1264+
;
1265+
; RV32-FAST-LABEL: aligned_bzero_7:
1266+
; RV32-FAST: # %bb.0:
1267+
; RV32-FAST-NEXT: sw zero, 3(a0)
1268+
; RV32-FAST-NEXT: sw zero, 0(a0)
1269+
; RV32-FAST-NEXT: ret
1270+
;
1271+
; RV64-FAST-LABEL: aligned_bzero_7:
1272+
; RV64-FAST: # %bb.0:
1273+
; RV64-FAST-NEXT: sw zero, 3(a0)
1274+
; RV64-FAST-NEXT: sw zero, 0(a0)
1275+
; RV64-FAST-NEXT: ret
12641276
tail call void @llvm.memset.inline.p0.i64(ptr align 8 %a, i8 0, i64 7, i1 0)
12651277
ret void
12661278
}
12671279

12681280
define void @aligned_bzero_15(ptr %a) nounwind {
1269-
; RV32-BOTH-LABEL: aligned_bzero_15:
1270-
; RV32-BOTH: # %bb.0:
1271-
; RV32-BOTH-NEXT: sb zero, 14(a0)
1272-
; RV32-BOTH-NEXT: sh zero, 12(a0)
1273-
; RV32-BOTH-NEXT: sw zero, 8(a0)
1274-
; RV32-BOTH-NEXT: sw zero, 4(a0)
1275-
; RV32-BOTH-NEXT: sw zero, 0(a0)
1276-
; RV32-BOTH-NEXT: ret
1281+
; RV32-LABEL: aligned_bzero_15:
1282+
; RV32: # %bb.0:
1283+
; RV32-NEXT: sb zero, 14(a0)
1284+
; RV32-NEXT: sh zero, 12(a0)
1285+
; RV32-NEXT: sw zero, 8(a0)
1286+
; RV32-NEXT: sw zero, 4(a0)
1287+
; RV32-NEXT: sw zero, 0(a0)
1288+
; RV32-NEXT: ret
12771289
;
1278-
; RV64-BOTH-LABEL: aligned_bzero_15:
1279-
; RV64-BOTH: # %bb.0:
1280-
; RV64-BOTH-NEXT: sb zero, 14(a0)
1281-
; RV64-BOTH-NEXT: sh zero, 12(a0)
1282-
; RV64-BOTH-NEXT: sw zero, 8(a0)
1283-
; RV64-BOTH-NEXT: sd zero, 0(a0)
1284-
; RV64-BOTH-NEXT: ret
1290+
; RV64-LABEL: aligned_bzero_15:
1291+
; RV64: # %bb.0:
1292+
; RV64-NEXT: sb zero, 14(a0)
1293+
; RV64-NEXT: sh zero, 12(a0)
1294+
; RV64-NEXT: sw zero, 8(a0)
1295+
; RV64-NEXT: sd zero, 0(a0)
1296+
; RV64-NEXT: ret
1297+
;
1298+
; RV32-FAST-LABEL: aligned_bzero_15:
1299+
; RV32-FAST: # %bb.0:
1300+
; RV32-FAST-NEXT: sw zero, 11(a0)
1301+
; RV32-FAST-NEXT: sw zero, 8(a0)
1302+
; RV32-FAST-NEXT: sw zero, 4(a0)
1303+
; RV32-FAST-NEXT: sw zero, 0(a0)
1304+
; RV32-FAST-NEXT: ret
1305+
;
1306+
; RV64-FAST-LABEL: aligned_bzero_15:
1307+
; RV64-FAST: # %bb.0:
1308+
; RV64-FAST-NEXT: sd zero, 7(a0)
1309+
; RV64-FAST-NEXT: sd zero, 0(a0)
1310+
; RV64-FAST-NEXT: ret
12851311
tail call void @llvm.memset.inline.p0.i64(ptr align 8 %a, i8 0, i64 15, i1 0)
12861312
ret void
12871313
}

0 commit comments

Comments
 (0)