Skip to content

Commit

Permalink
Implement -codegen-fuse-op-and-check true for C codegen
Browse files Browse the repository at this point in the history
It appears that GCC (and, to a lesser extent, Clang/LLVM) does not always
successfully fuse adjacent `Word<N>_<op>` and
`Word{S,U}<N>_<op>CheckP` primitives.  The performance results
reported at MLton#273 and
MLton#292 suggest that this does not
always have a significant impact, but a close look at the `md5`
benchmark shows that the native codegen significantly outperforms the
C codegen with gcc-9 due to redundant arithmetic computations (one for
`Word{S,U}<N>_<op>CheckP` and another for `Word<N>_<op>`).

(Note: Because the final md5 state is not used by the `md5` benchmark
program, MLton actually optimizes out most of the md5 computation.
What is left is a lot of arithmetic from `PackWord32Little.subVec` to
check for indices that should raise `Subscript`.)

For example, with `-codegen-fuse-op-and-check false` and gcc-9, the
`transform` function of `md5` has the following assembly:

	movl	%r9d, %r10d
	subl	$1, %r10d
	jo	.L650
	leal	-1(%r8), %r10d
	movl	%r10d, %r12d
	addl	%r10d, %edx
	jo	.L650
	addl	%r10d, %r11d
	cmpl	%eax, %r11d
	jnb	.L656
	movl	%ebp, %edx
	addl	$1, %edx
	jo	.L659
	leal	1(%rcx), %edx
	movl	%edx, %r11d
	imull	%r9d, %r11d
	jo	.L650
	imull	%r8d, %edx
	movl	%edx, %r11d
	addl	%r10d, %r11d
	jo	.L650
	leal	(%rdx,%r10), %r11d
	cmpl	%eax, %r11d
	jnb	.L665

What seems to have happened is that gcc has arranged for equivalent
values to be in `%r8` and `%r9`.  In the first three lines, there is
an implementation of `WordS32_subCheckP (X, 1)` using `subl/jo`, while
in the fourth line, there is an implementation of `Word32_sub (X, 1)`
using `lea` with an offset of `-1`.  Notice that `%r10` is used for
the result of both, so the fourth line is redundant (the value is
already in `%r10`).

On the other hand, with `-codegen-fuse-op-and-check true` and gcc-9,
the `transform` function of `md5` has the following assembly:

	movl	%r8d, %r9d
	subl	$1, %r9d
	jo	.L645
	addl	%r9d, %ecx
	jo	.L645
	cmpl	%edx, %ecx
	jnb	.L651
	movl	%eax, %ecx
	addl	$1, %ecx
	jo	.L654
	imull	%r8d, %ecx
	jo	.L645
	addl	%r9d, %ecx
	jo	.L645
	cmpl	%edx, %ecx
	jnb	.L660
  • Loading branch information
MatthewFluet committed Aug 9, 2019
1 parent 3d1e89c commit 68f8512
Showing 1 changed file with 110 additions and 1 deletion.
111 changes: 110 additions & 1 deletion mlton/codegen/c-codegen/c-codegen.fun
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,112 @@ fun output {program as Machine.Program.T {chunks, frameInfos, main, ...},
in
outputStatement s
end
(* Emit the statements of a block, fusing each adjacent pair
 *
 *    dst1 = Word<N>_<op> (args)
 *    dst2 = Word{S,U}<N>_<op>CheckP (args)
 *
 * (for <op> among add, mul, neg, sub) into a single C call.  The name of
 * the call is the check primitive's name with "CheckP" replaced by
 * "AndCheck"; it is passed the shared arguments followed by the
 * addresses of both destinations.  Every statement that is not part of
 * such a pair is handed to outputStatement unchanged.
 *
 * Neighbouring statements are only inspected after an explicit bounds
 * test, and the pairing rule lives in one place (matchPair), so the
 * decision to fuse a pair and the decision to skip its second half
 * cannot disagree.
 *)
fun outputStatementsFuseOpAndChk statements =
   let
      val numStatements = Vector.length statements
      (* Is chkPrim the overflow-check primitive that belongs to the
       * arithmetic primitive opPrim (same operation, same word size and,
       * for multiplication, same signedness)?
       *)
      fun isCheckOf (opPrim, chkPrim) =
         case (Prim.name opPrim, Prim.name chkPrim) of
            (Prim.Name.Word_add ws1, Prim.Name.Word_addCheckP (ws2, _)) =>
               WordSize.equals (ws1, ws2)
          | (Prim.Name.Word_mul (ws1, {signed = signed1}),
             Prim.Name.Word_mulCheckP (ws2, {signed = signed2})) =>
               WordSize.equals (ws1, ws2)
               andalso Bool.equals (signed1, signed2)
          | (Prim.Name.Word_neg ws1, Prim.Name.Word_negCheckP (ws2, _)) =>
               WordSize.equals (ws1, ws2)
          | (Prim.Name.Word_sub ws1, Prim.Name.Word_subCheckP (ws2, _)) =>
               WordSize.equals (ws1, ws2)
          | _ => false
      (* If opStmt followed by chkStmt forms a fusable pair, return the
       * pieces needed to emit the fused call; otherwise NONE.  Both
       * statements must be PrimApps with a destination and equal
       * argument vectors.
       *)
      fun matchPair (opStmt, chkStmt) =
         case (opStmt, chkStmt) of
            (Statement.PrimApp {args = args1, dst = SOME dst1, prim = prim1},
             Statement.PrimApp {args = args2, dst = SOME dst2, prim = prim2}) =>
               if Vector.equals (args1, args2, Operand.equals)
                  andalso isCheckOf (prim1, prim2)
                  then SOME {args = args1,
                             chkDst = dst2,
                             chkPrim = prim2,
                             opDst = dst1}
               else NONE
          | _ => NONE
      (* Print the fused call for the pair (opStmt, chkStmt), preceded by
       * both original statements as C comments when codegenComments > 1.
       *)
      fun outputFused (opStmt, chkStmt, {args, chkDst, chkPrim, opDst}) =
         let
            val name =
               String.substituteFirst
               (Prim.toString chkPrim,
                {substring = "CheckP",
                 replacement = "AndCheck"})
            val _ =
               if !Control.codegenComments > 1
                  then (print "\t/* "
                        ; print (Layout.toString (Statement.layout opStmt))
                        ; print " */\n"
                        ; print "\t/* "
                        ; print (Layout.toString (Statement.layout chkStmt))
                        ; print " */\n")
               else ()
            val _ = print "\t"
            val _ =
               print (C.call (name,
                              Vector.toListMap (args, fetchOperand) @
                              [addr (operandToString opDst),
                               addr (operandToString chkDst)]))
         in
            ()
         end
   in
      Vector.foreachi
      (statements, fn (i, s) =>
       let
          (* True when s is the check half of a pair whose fused call
           * was already printed while visiting statement i - 1.
           *)
          val fusedWithPrevious =
             i > 0
             andalso (case matchPair (Vector.sub (statements, i - 1), s) of
                         SOME _ => true
                       | NONE => false)
       in
          if fusedWithPrevious
             then ()
          else if i + 1 < numStatements
             then
                let
                   val next = Vector.sub (statements, i + 1)
                in
                   case matchPair (s, next) of
                      SOME pair => outputFused (s, next, pair)
                    | NONE => outputStatement s
                end
          else outputStatement s
       end)
   end
fun outputBlock (Block.T {kind, label, statements, transfer, ...}) =
let
val _ = prints [Label.toString label, ":\n"]
Expand All @@ -1157,7 +1263,10 @@ fun output {program as Machine.Program.T {chunks, frameInfos, main, ...},
| Kind.Func _ => ()
| Kind.Handler {frameInfo, ...} => pop frameInfo
| Kind.Jump => ()
val _ = Vector.foreach (statements, outputStatement)
val _ =
if !Control.codegenFuseOpAndChk
then outputStatementsFuseOpAndChk statements
else Vector.foreach (statements, outputStatement)
val _ = outputTransfer transfer
val _ = print "\n"
in
Expand Down

0 comments on commit 68f8512

Please sign in to comment.