Skip to content

Commit 68f8512

Browse files
committed
Implement -codegen-fuse-op-and-check true for C codegen
It appears that GCC (and, to a lesser extent, Clang/LLVM) do not always successfully fuse adjacent `Word<N>_<op>` and `Word{S,U}<N>_<op>CheckP` primitives. The performance results reported at MLton#273 and MLton#292 suggest that this does not always have a significant impact, but a close look at the `md5` benchmark shows that the native codegen significantly outperforms the C codegen with gcc-9 due to redundant arithmetic computations (one for `Word{S,U}<N>_<op>CheckP` and another for `Word<N>_<op>`). (Note: Because the final md5 state is not used by the `md5` benchmark program, MLton actually optimizes out most of the md5 computation. What is left is a lot of arithmetic from `PackWord32Little.subVec` to check for indices that should raise `Subscript`.) For example, with `-codegen-fuse-op-and-check false` and gcc-9, the `transform` function of `md5` has the following assembly: movl %r9d, %r10d subl $1, %r10d jo .L650 leal -1(%r8), %r10d movl %r10d, %r12d addl %r10d, %edx jo .L650 addl %r10d, %r11d cmpl %eax, %r11d jnb .L656 movl %ebp, %edx addl $1, %edx jo .L659 leal 1(%rcx), %edx movl %edx, %r11d imull %r9d, %r11d jo .L650 imull %r8d, %edx movl %edx, %r11d addl %r10d, %r11d jo .L650 leal (%rdx,%r10), %r11d cmpl %eax, %r11d jnb .L665 What seems to have happened is that gcc has arranged for equivalent values to be in `%r8` and `%r9`. In the first three lines, there is an implementation of `WordS32_subCheckP (X, 1)` using `subl/jo`, while in the fourth line, there is an implementation of `Word32_sub (X, 1)` using `lea` with an offset of `-1`. Notice that `%r10` is used for the result of both, so the fourth line is redundant (the value is already in `%r10`). 
On the other hand, with `-codegen-fuse-op-and-check true` and gcc-9, the `transform` function of `md5` has the following assembly: movl %r8d, %r9d subl $1, %r9d jo .L645 addl %r9d, %ecx jo .L645 cmpl %edx, %ecx jnb .L651 movl %eax, %ecx addl $1, %ecx jo .L654 imull %r8d, %ecx jo .L645 addl %r9d, %ecx jo .L645 cmpl %edx, %ecx jnb .L660
1 parent 3d1e89c commit 68f8512

File tree

1 file changed

+110
-1
lines changed

1 file changed

+110
-1
lines changed

mlton/codegen/c-codegen/c-codegen.fun

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1132,6 +1132,112 @@ fun output {program as Machine.Program.T {chunks, frameInfos, main, ...},
11321132
in
11331133
outputStatement s
11341134
end
1135+
(* Emit the statements of a block, fusing each adjacent pair
 *
 *    dst1 = Word<N>_<op> (args)
 *    dst2 = Word{S,U}<N>_<op>CheckP (args)
 *
 * (for <op> in add, mul, neg, sub) into one call of the C function whose
 * name is the check primitive's name with "CheckP" replaced by "AndCheck".
 * The call receives the fetched arguments followed by the addresses of
 * dst1 and dst2.  Every statement that is not part of such a pair is
 * emitted with outputStatement.
 *
 * A pair is fused only when
 *   - both statements are PrimApps with a destination,
 *   - their argument vectors are equal under Operand.equals,
 *   - the word sizes (and, for mul, the signedness) agree, and
 *   - dst1 is not itself one of the arguments.
 *
 * The last condition matters because the first statement may overwrite an
 * operand that the second statement then reads; the fused call takes its
 * arguments by value, so it would check the old value instead of the
 * updated one.
 * NOTE(review): only exact operand equality is detected here; a dst1 that
 * merely overlaps an argument (e.g. as the base of an offset operand) is
 * not caught -- confirm whether a stronger interference test is needed.
 *)
fun outputStatementsFuseOpAndChk statements =
   let
      (* true iff the statement about to be visited was already emitted as
       * the second half of a fused pair and must therefore be dropped.
       *)
      val skipNext = ref false
      (* The statement following index i, if any.  The Subscript handler is
       * deliberately confined to the lookup so that a Subscript raised
       * while emitting code is never mistaken for "no next statement".
       *)
      fun nextStatement i =
         (SOME (Vector.sub (statements, i + 1)))
         handle Subscript => NONE
   in
      Vector.foreachi
      (statements, fn (i, s1) =>
       let
          fun default () = outputStatement s1
       in
          if !skipNext
             then skipNext := false
          else
             case s1 of
                Statement.PrimApp {args = args1, dst = SOME dst1, prim = prim1} =>
                   let
                      (* Try to fuse s1 with its successor; chk decides
                       * whether the successor's primitive is the check
                       * that corresponds to prim1.
                       *)
                      fun fuse chk =
                         case nextStatement i of
                            SOME (s2 as Statement.PrimApp {args = args2, dst = SOME dst2, prim = prim2}) =>
                               if Vector.equals (args1, args2, Operand.equals)
                                  andalso not (Vector.exists
                                               (args1, fn arg =>
                                                Operand.equals (arg, dst1)))
                                  andalso chk prim2
                                  then let
                                          val name =
                                             String.substituteFirst
                                             (Prim.toString prim2,
                                              {substring = "CheckP",
                                               replacement = "AndCheck"})
                                          val () =
                                             if !Control.codegenComments > 1
                                                then (print "\t/* "
                                                      ; print (Layout.toString (Statement.layout s1))
                                                      ; print " */\n"
                                                      ; print "\t/* "
                                                      ; print (Layout.toString (Statement.layout s2))
                                                      ; print " */\n")
                                                else ()
                                          val () = print "\t"
                                          val () =
                                             print (C.call (name,
                                                            Vector.toListMap (args1, fetchOperand) @
                                                            [addr (operandToString dst1),
                                                             addr (operandToString dst2)]))
                                       in
                                          (* s2 has been emitted as part of
                                           * the fused call; drop it when the
                                           * traversal reaches it.
                                           *)
                                          skipNext := true
                                       end
                                  else default ()
                          | _ => default ()
                   in
                      case Prim.name prim1 of
                         Prim.Name.Word_add ws1 =>
                            fuse (fn prim2 =>
                                  case Prim.name prim2 of
                                     Prim.Name.Word_addCheckP (ws2, _) =>
                                        WordSize.equals (ws1, ws2)
                                   | _ => false)
                       | Prim.Name.Word_mul (ws1, {signed = signed1}) =>
                            fuse (fn prim2 =>
                                  case Prim.name prim2 of
                                     Prim.Name.Word_mulCheckP (ws2, {signed = signed2}) =>
                                        WordSize.equals (ws1, ws2)
                                        andalso Bool.equals (signed1, signed2)
                                   | _ => false)
                       | Prim.Name.Word_neg ws1 =>
                            fuse (fn prim2 =>
                                  case Prim.name prim2 of
                                     Prim.Name.Word_negCheckP (ws2, _) =>
                                        WordSize.equals (ws1, ws2)
                                   | _ => false)
                       | Prim.Name.Word_sub ws1 =>
                            fuse (fn prim2 =>
                                  case Prim.name prim2 of
                                     Prim.Name.Word_subCheckP (ws2, _) =>
                                        WordSize.equals (ws1, ws2)
                                   | _ => false)
                       (* Check primitives that were not consumed by a fusion
                        * (and every other primitive) are emitted unchanged.
                        *)
                       | _ => default ()
                   end
              | _ => default ()
       end)
   end
11351241
fun outputBlock (Block.T {kind, label, statements, transfer, ...}) =
11361242
let
11371243
val _ = prints [Label.toString label, ":\n"]
@@ -1157,7 +1263,10 @@ fun output {program as Machine.Program.T {chunks, frameInfos, main, ...},
11571263
| Kind.Func _ => ()
11581264
| Kind.Handler {frameInfo, ...} => pop frameInfo
11591265
| Kind.Jump => ()
1160-
val _ = Vector.foreach (statements, outputStatement)
1266+
val _ =
1267+
if !Control.codegenFuseOpAndChk
1268+
then outputStatementsFuseOpAndChk statements
1269+
else Vector.foreach (statements, outputStatement)
11611270
val _ = outputTransfer transfer
11621271
val _ = print "\n"
11631272
in

0 commit comments

Comments
 (0)