Skip to content

Commit 472fcee

Browse files
Mike PallBuristan
Mike Pall
authored andcommitted
x86/x64: Add more red zone checks to assembler backend.
Thanks to Peter Cawley. (cherry picked from commit d854d00) Assembling some instructions (like `IR_CONV int.num`, for example) with many mcode to be emitted may overflow the `MCLIM_REDZONE` (64) at once due to the huge mcode emitting. For example `IR_CONV` in this test requires 66 bytes of the machine code: | cvttsd2si r15d, xmm5 | xorps xmm9, xmm9 | cvtsi2sd xmm9, r15d | ucomisd xmm5, xmm9 | jnz 0x11edb00e5 ->37 | jpe 0x11edb00e5 ->37 | mov [rsp+0x80], r15d | mov r15, [rsp+0xe8] | movsd xmm9, [rsp+0xe0] | movsd xmm5, [rsp+0xd8] The reproducer needs sufficient register pressure as to immediately spill the result of the instruction to the stack and then reload the three registers used by the instruction, and to have chosen enough registers with numbers >=8 (because shaving off a REX prefix [1] or two would get 66 back down to <= `MCLIM_REDZONE`), and to be using lots of spill slots (because memory offsets <= 0x7f are shorter to encode compared to those >= 0x80. So, each reload instruction consumes 9 bytes. This makes this reproducer unstable (regarding the register allocator changes). Thus, only original test case is added as a regression test. This patch adds the red zone overflow checks more often for the IRs with many instructions to be emitted. Sergey Kaplun: * added the description and the test for the problem [1]: https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix Part of tarantool/tarantool#10709
1 parent ffede1b commit 472fcee

File tree

2 files changed

+124
-2
lines changed

2 files changed

+124
-2
lines changed

src/lj_asm_x86.h

+6-2
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ static void asm_tointg(ASMState *as, IRIns *ir, Reg left)
794794
emit_rr(as, XO_UCOMISD, left, tmp);
795795
emit_rr(as, XO_CVTSI2SD, tmp, dest);
796796
emit_rr(as, XO_XORPS, tmp, tmp); /* Avoid partial register stall. */
797+
checkmclim(as);
797798
emit_rr(as, XO_CVTTSD2SI, dest, left);
798799
/* Can't fuse since left is needed twice. */
799800
}
@@ -836,6 +837,7 @@ static void asm_conv(ASMState *as, IRIns *ir)
836837
emit_rr(as, XO_SUBSD, dest, bias); /* Subtract 2^52+2^51 bias. */
837838
emit_rr(as, XO_XORPS, dest, bias); /* Merge bias and integer. */
838839
emit_rma(as, XO_MOVSD, bias, k);
840+
checkmclim(as);
839841
emit_mrm(as, XO_MOVD, dest, asm_fuseload(as, lref, RSET_GPR));
840842
return;
841843
} else { /* Integer to FP conversion. */
@@ -1151,6 +1153,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
11511153
asm_guardcc(as, CC_E);
11521154
else
11531155
emit_sjcc(as, CC_E, l_end);
1156+
checkmclim(as);
11541157
if (irt_isnum(kt)) {
11551158
if (isk) {
11561159
/* Assumes -0.0 is already canonicalized to +0.0. */
@@ -1210,7 +1213,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
12101213
#endif
12111214
}
12121215
emit_sfixup(as, l_loop);
1213-
checkmclim(as);
12141216
#if LJ_GC64
12151217
if (!isk && irt_isaddr(kt)) {
12161218
emit_rr(as, XO_OR, tmp|REX_64, key);
@@ -1242,6 +1244,7 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
12421244
emit_rr(as, XO_ARITH(XOg_SUB), dest, tmp);
12431245
emit_shifti(as, XOg_ROL, tmp, HASH_ROT3);
12441246
emit_rr(as, XO_ARITH(XOg_XOR), dest, tmp);
1247+
checkmclim(as);
12451248
emit_shifti(as, XOg_ROL, dest, HASH_ROT2);
12461249
emit_rr(as, XO_ARITH(XOg_SUB), tmp, dest);
12471250
emit_shifti(as, XOg_ROL, dest, HASH_ROT1);
@@ -1259,7 +1262,6 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
12591262
} else {
12601263
emit_rr(as, XO_MOV, tmp, key);
12611264
#if LJ_GC64
1262-
checkmclim(as);
12631265
emit_gri(as, XG_ARITHi(XOg_XOR), dest, irt_toitype(kt) << 15);
12641266
if ((as->flags & JIT_F_BMI2)) {
12651267
emit_i8(as, 32);
@@ -1530,6 +1532,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
15301532
if (irt_islightud(ir->t)) {
15311533
Reg dest = asm_load_lightud64(as, ir, 1);
15321534
if (ra_hasreg(dest)) {
1535+
checkmclim(as);
15331536
asm_fuseahuref(as, ir->op1, RSET_GPR);
15341537
emit_mrm(as, XO_MOV, dest|REX_64, RID_MRM);
15351538
}
@@ -1574,6 +1577,7 @@ static void asm_ahuvload(ASMState *as, IRIns *ir)
15741577
if (LJ_64 && irt_type(ir->t) >= IRT_NUM) {
15751578
lj_assertA(irt_isinteger(ir->t) || irt_isnum(ir->t),
15761579
"bad load type %d", irt_type(ir->t));
1580+
checkmclim(as);
15771581
#if LJ_GC64
15781582
emit_u32(as, LJ_TISNUM << 15);
15791583
#else
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
local tap = require('tap')
2+
-- Test file to demonstrate mcode area overflow during recording a
3+
-- trace with the high FPR pressure.
4+
-- See also, https://github.com/LuaJIT/LuaJIT/issues/1116.
5+
--
6+
-- XXX: Test fails only with GC64 enabled before the commit.
7+
local test = tap.test('lj-1116-redzones-checks'):skipcond({
8+
['Test requires JIT enabled'] = not jit.status(),
9+
})
10+
11+
test:plan(1)
12+
13+
jit.opt.start('hotloop=1')
14+
15+
-- XXX: This test snippet was originally created by the fuzzer.
16+
-- See https://oss-fuzz.com/testcase-detail/5622965122170880.
17+
--
18+
-- Unfortunately, it's impossible to reduce the testcase further.
19+
-- Before the patch, assembling some instructions (like `IR_CONV
20+
-- int.num`, for example) with many mcode to be emitted may
21+
-- overflow the `MCLIM_REDZONE` (64) at once due to the huge
22+
-- mcode emitting.
23+
-- For example `IR_CONV` in this test requires 66 bytes of the
24+
-- machine code:
25+
-- | cvttsd2si r15d, xmm5
26+
-- | xorps xmm9, xmm9
27+
-- | cvtsi2sd xmm9, r15d
28+
-- | ucomisd xmm5, xmm9
29+
-- | jnz 0x11edb00e5 ->37
30+
-- | jpe 0x11edb00e5 ->37
31+
-- | mov [rsp+0x80], r15d
32+
-- | mov r15, [rsp+0xe8]
33+
-- | movsd xmm9, [rsp+0xe0]
34+
-- | movsd xmm5, [rsp+0xd8]
35+
--
36+
-- The reproducer needs sufficient register pressure as to
37+
-- immediately spill the result of the instruction to the stack
38+
-- and then reload the three registers used by the instruction,
39+
-- and to have chosen enough registers with numbers >=8 (because
40+
-- shaving off a REX prefix [1] or two would get 66 back down
41+
-- to <= `MCLIM_REDZONE`), and to be using lots of spill slots
42+
-- (because memory offsets <= 0x7f are shorter to encode compared
43+
-- to those >= 0x80. So, each reload instruction consumes 9 bytes.
44+
-- This makes this reproducer unstable (regarding the register
45+
-- allocator changes). So, lets use this as a regression test.
46+
--
47+
-- [1]: https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
48+
49+
_G.a = 0
50+
_G.b = 0
51+
_G.c = 0
52+
_G.d = 0
53+
_G.e = 0
54+
_G.f = 0
55+
_G.g = 0
56+
_G.h = 0
57+
-- Skip `i`.
58+
_G.j = 0
59+
_G.k = 0
60+
_G.l = 0
61+
_G.m = 0
62+
_G.n = 0
63+
_G.o = 0
64+
_G.p = 0
65+
_G.q = 0
66+
_G.r = 0
67+
_G.s = 0
68+
_G.t = 0
69+
_G.u = 0
70+
_G.v = 0
71+
_G.w = 0
72+
_G.x = 0
73+
_G.y = 0
74+
_G.z = 0
75+
76+
-- XXX: Need here not 4, but 4.5 top border of the cycle to create
77+
-- FPR pressure.
78+
for i = 1, 4.5 do
79+
_G.a = _G.a + 1
80+
_G.b = _G.b + 1
81+
_G.c = _G.c + 1
82+
_G.d = _G.d + 1
83+
for _ = i, 2 do
84+
_G.e = _G.e + 1
85+
end
86+
-- Here we emit `IR_CONV int.num`. This loop is inlined.
87+
-- Assertion failed after emitting the variant part of the
88+
-- big loop.
89+
for _ = 2, i do
90+
_G.f = _G.f + 1
91+
_G.g = _G.g + 1
92+
_G.h = _G.h + 1
93+
_G.j = _G.j + 1
94+
_G.k = _G.k + 1
95+
_G.l = _G.l + 1
96+
end
97+
_G.m = _G.m + 1
98+
_G.n = _G.n + 1
99+
_G.o = _G.o + 1
100+
_G.p = _G.p + 1
101+
_G.q = _G.q + 1
102+
_G.r = _G.r + 1
103+
_G.s = _G.s + 1
104+
_G.t = _G.t + 1
105+
_G.u = _G.u + 1
106+
_G.v = _G.v + 1
107+
for _ = i, 2.1 do
108+
_G.aa = _G.a
109+
_G.w = _G.w + 1
110+
_G.x = _G.x + 1
111+
_G.y = _G.y + 1
112+
_G.z = _G.z + 1
113+
end
114+
end
115+
116+
test:ok(true, 'no mcode limit assertion failed during recording')
117+
118+
test:done(true)

0 commit comments

Comments
 (0)