Skip to content

Commit

Permalink
Merge pull request #351 from MatthewFluet/c-and-llvm-codegen-updates
Browse files Browse the repository at this point in the history
Updates to C and LLVM codegens. Highlights:

* Add `Machine.Program.rflow` to compute `{returns,raises}To` control
  flow (654c557) and use in `functor Chunkify` (1b3b7b8) and in
  Machine IR `Raise/Return` transfers (cf8e487).
* Add `-chunk-jump-table {false|true}` compile-time option to force
  generation of a jump table for the chunk switch (8e0dd2d,
  5b6439b, 087a5b1).
* Add `-chunk-{{must,may}-rto-self,must-rto-sing,must-rto-other}-opt`
  compile-time options to optimize return/raise transfers (7c10c70,
  4d5abde, 4b7c649, c3b9905, 473808f)
* Experiment using LLVM's `cc10` (aka, `ghccc`) calling convention
  (2e26ebd).
* Experiment with a new `simple` chunkify strategy (3330cbe,
  3d9c499, 138512f, faef164, d1df0de); generally performs
  about the same as `coalesce4096`, significantly improves `fib` and
  `tak` (for GCC), slightly improves `hamlet`, but slightly worsens
  `raytrace`:

  config command                                                                                                                          
  C04    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen c -cc gcc-9                                                           
  C05    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen c -cc gcc-9 -chunkify simple                                          
  C09    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen llvm -cc clang                                                        
  C10    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen llvm -cc clang -chunkify simple                                       

  task_clock [email protected] (2-level)
  program           `C05/C04` `C10/C09`
  barnes-hut           0.9978    0.9589
  boyer                1.064     1.076 
  checksum             1.051     0.9775
  count-graphs         1.005     0.9876
  DLXSimulator         1.000     0.9905
  even-odd             1.037     0.9989
  fft                  0.9616    0.9537
  fib                  0.6689    0.6260
  flat-array           1.000     0.9645
  hamlet               0.9547    0.9322
  imp-for              1.067     1.014 
  knuth-bendix         1.092     1.031 
  lexgen               1.031     1.078 
  life                 1.002     0.9911
  logic                1.016     1.015 
  mandelbrot           0.9776    1.030 
  matrix-multiply      0.9903    0.9844
  md5                  1.008     0.9940
  merge                0.9927    1.062 
  mlyacc               0.9810    1.024 
  model-elimination    0.9877    0.9743
  mpuz                 1.011     1.010 
  nucleic              1.036     1.030 
  output1              0.9943    1.021 
  peek                 1.036     1.027 
  pidigits             1.000     0.9653
  psdes-random         1.009     1.014 
  ratio-regions        0.9985    0.9881
  ray                  0.9738    0.9601
  raytrace             1.101     1.100 
  simple               0.9620    0.9272
  smith-normal-form    0.9690    0.9806
  string-concat        0.9610    0.9772
  tailfib              1.006     0.9292
  tailmerge            0.9847    1.023 
  tak                  0.8264    1.013 
  tensor               1.010     0.9998
  tsp                  0.9981    1.010 
  tyan                 1.045     1.027 
  vector-rev           1.012     0.9891
  vector32-concat      0.9495    1.030 
  vector64-concat      1.098     0.9744
  vliw                 0.9413    1.019 
  wc-input1            0.9301    1.098 
  wc-scanStream        1.114     0.9234
  zebra                1.008     1.001 
  zern                 0.9819    1.014 
  MIN                  0.6689    0.6260
  GMEAN                0.9940    0.9912
  MAX                  1.114     1.100 

  The `simple` chunkify strategy is not (yet) suitable for a
  self-compile; it can generate excessively large chunks, including
  one for a self-compile that requires 8min to compile by `gcc`.
* Add `expect: WordX.t option` to RSSA and Machine `Switch.T`
  (911b5d4, e2b27ab, 695320d) and add `-gc-expect
  {none|false|true}` compile-time option, where `-gc-expect false`
  should indicate that performing a GC is a cold path (823815a); no
  notable performance impact.
* Lots of tweaks to C codegen, ultimately eliminating almost all
  `c-chunk.h` macros.
* Eliminate unused `Machine.Operand.Contents` constructor (006269b).
* Make a major refactoring of LLVM codegen (cec30c5).
* Implement `Real<N>_qequal` for C codegen (9b7b2bd) and use
  `is{less,lessequal}` for `Real<N>_l{t,e}` for C codegen (7b55819).
* Generalize LLVM type-based alias-analysis (27709ef).
* Add `-llvm-aamd scope` for simple `noalias`/`alias.scope`
  alias-analysis metadata in LLVM codegen (b825f56); no notable
  performance impact.
* Use C99/C11 `inline` for primitive and Basis Library functions
  (311331c, c864492, 4f2d213).
* Add `-codegen-fuse-op-and-chk {false|true}` compile-time option to
  explicitly fuse adjacent `Word<N>_<op>` and
  `Word{S,U}<N>_<op>CheckP` primitives in the C and LLVM codegens
  (6b738b8, 3d1e89c, 68f8512, 82c019f, 61de560, 5363199,
  0d46a85).  It appears that GCC (and, to a lesser extent,
  Clang/LLVM) does not always successfully fuse adjacent
  `Word<N>_<op>` and `Word{S,U}<N>_<op>CheckP` primitives.  The
  performance results reported at
  #273 and
  #292 suggest that this does not
  always have a significant impact, but sometimes
  `-codegen-fuse-op-and-chk true` can have a positive impact.  Unfortunately,
  it can also have a (significant) negative impact.  In
  `matrix-multiply` and `vector-rev`, fusing can cause GCC to not
  recognize that an explicit sequence index can be replaced by a
  stride length; in these benchmarks, it would be nice if MLton
  eliminated the overflow checks.

  config command                                                                                                                          
  C04    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen c -cc gcc-9                                                           
  C09    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen llvm -cc clang                                                        
  C11    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen c -cc gcc-9 -codegen-fuse-op-and-chk true                             
  C15    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen llvm -cc clang -codegen-fuse-op-and-chk true                          

  task_clock [email protected] (2-level)
  program           `C11/C04` `C15/C09`
  barnes-hut           1.005     0.9925
  boyer                1.052     1.013 
  checksum             1.022     1.028 
  count-graphs         0.9722    1.002 
  DLXSimulator         1.004     0.9959
  even-odd             0.8768    1.003 
  fft                  0.9592    1.016 
  fib                  0.9732    0.9798
  flat-array           0.8148    1.019 
  hamlet               0.9966    1.030 
  imp-for              0.8993    0.7985
  knuth-bendix         1.008     1.013 
  lexgen               0.9851    1.043 
  life                 0.9954    1.006 
  logic                0.9994    1.014 
  mandelbrot           0.9440    1.013 
  matrix-multiply      1.336     1.009 
  md5                  0.9604    1.007 
  merge                0.9675    1.037 
  mlyacc               1.032     1.029 
  model-elimination    1.010     1.004 
  mpuz                 1.035     0.9599
  nucleic              0.9938    0.9983
  output1              0.9278    0.9709
  peek                 0.9850    1.035 
  pidigits             0.9702    0.9538
  psdes-random         1.017     0.9986
  ratio-regions        0.9801    0.9887
  ray                  0.9795    1.009 
  raytrace             0.9959    1.026 
  simple               0.9764    1.010 
  smith-normal-form    1.002     1.049 
  string-concat        0.7919    0.9035
  tailfib              1.030     1.227 
  tailmerge            1.017     0.9980
  tak                  0.9790    0.9988
  tensor               0.5258    1.000 
  tsp                  0.9845    1.013 
  tyan                 1.019     0.9739
  vector-rev           1.178     1.253 
  vector32-concat      0.8703    0.9230
  vector64-concat      0.8906    0.9038
  vliw                 0.9921    1.044 
  wc-input1            1.060     0.9809
  wc-scanStream        0.9166    1.040 
  zebra                1.008     1.020 
  zern                 1.051     1.089 
  MIN                  0.5258    0.7985
  GMEAN                0.9720    1.007 
  MAX                  1.336     1.253 

  Note: the issue with `md5` mentioned in the commit messages is with
  respect to the `md5` benchmark before 2daaebf.

Overall, this simplifies the C and LLVM codegen slightly, although
there is little significant performance change:

config command                                                                                                                          
C02    /home/mtf/devel/mlton/builds/g89891a411/bin/mlton -codegen c -cc gcc-9                                                           
C04    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen c -cc gcc-9                                                           
C08    /home/mtf/devel/mlton/builds/g89891a411/bin/mlton -codegen llvm -cc clang                                                        
C09    /home/mtf/devel/mlton/builds/g098009d49/bin/mlton -codegen llvm -cc clang                                                        

task_clock [email protected] (2-level)
program           `C04/C02` `C09/C08`
barnes-hut           1.036     1.025 
boyer                0.9731    1.006 
checksum             0.9652    1.002 
count-graphs         0.9988    0.9964
DLXSimulator         0.9970    1.023 
even-odd             1.002     0.9881
fft                  1.026     0.9674
fib                  0.9034    0.7846
flat-array           1.014     1.021 
hamlet               0.9740    1.010 
imp-for              0.9707    0.9908
knuth-bendix         0.9077    0.9777
lexgen               1.048     0.8985
life                 1.002     0.9827
logic                1.006     0.9867
mandelbrot           1.000     1.011 
matrix-multiply      1.020     0.9957
md5                  0.9700    0.9960
merge                0.9974    0.9818
mlyacc               1.003     0.9824
model-elimination    0.9936    0.9817
mpuz                 0.9815    0.9466
nucleic              0.9946    1.002 
output1              1.007     1.026 
peek                 0.9832    0.9898
pidigits             0.9950    1.047 
psdes-random         1.009     0.9869
ratio-regions        0.9978    0.9725
ray                  0.9938    0.9663
raytrace             0.9975    1.032 
simple               0.9936    1.000 
smith-normal-form    1.038     0.9941
string-concat        1.041     1.014 
tailfib              0.9865    0.9741
tailmerge            1.010     1.020 
tak                  0.9331    0.9041
tensor               0.9938    0.9941
tsp                  0.9825    1.004 
tyan                 0.9960    0.9879
vector-rev           1.014     0.9091
vector32-concat      1.090     0.9016
vector64-concat      0.9994    0.9800
vliw                 0.9995    0.9876
wc-input1            0.9685    0.8634
wc-scanStream        1.178     1.105 
zebra                0.9857    0.9900
zern                 0.9733    0.9890
MIN                  0.9034    0.7846
GMEAN                0.9982    0.9815
MAX                  1.178     1.105
  • Loading branch information
MatthewFluet authored Nov 22, 2019
2 parents 60b338b + 29ae87c commit 7ab49d0
Show file tree
Hide file tree
Showing 81 changed files with 5,244 additions and 4,213 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ Here are the changes from version 20180206 to version YYYYMMDD.

=== Details

* 2019-11-22
** Many updates and improvements to C and LLVM codegens. See
https://github.com/MLton/mlton/pull/351 for details.

* 2019-11-05
** Change `OS.IO.poll` to not be restarted when interrupted by a
signal. (This is consistent with `Socket.select`.)
Expand Down
546 changes: 273 additions & 273 deletions basis-library/primitive/basis-ffi.sml

Large diffs are not rendered by default.

36 changes: 9 additions & 27 deletions basis-library/primitive/prim-real.sml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(* Copyright (C) 2012,2013 Matthew Fluet.
(* Copyright (C) 2012,2013,2019 Matthew Fluet.
* Copyright (C) 1999-2007 Henry Cejtin, Matthew Fluet, Suresh
* Jagannathan, and Stephen Weeks.
* Copyright (C) 1997-2000 NEC Research Institute.
Expand Down Expand Up @@ -103,6 +103,7 @@ open Primitive

structure Real32 : PRIM_REAL =
struct
open PrimitiveFFI.Real32
open Real32

val realSize : Int32.int = 32
Expand All @@ -113,24 +114,21 @@ structure Real32 : PRIM_REAL =
structure Math =
struct
type real = real
open Math

val acos = _prim "Real32_Math_acos": real -> real;
val asin = _prim "Real32_Math_asin": real -> real;
val atan = _prim "Real32_Math_atan": real -> real;
val atan2 = _prim "Real32_Math_atan2": real * real -> real;
val cos = _prim "Real32_Math_cos": real -> real;
val cosh = _import "Real32_Math_cosh" private: real -> real;
val e = #1 _symbol "Real32_Math_e" private: real GetSet.t; ()
val e = eGet ()
val exp = _prim "Real32_Math_exp": real -> real;
val ln = _prim "Real32_Math_ln": real -> real;
val log10 = _prim "Real32_Math_log10": real -> real;
val pi = #1 _symbol "Real32_Math_pi" private: real GetSet.t; ()
val pow = _import "Real32_Math_pow" private: real * real -> real;
val pi = piGet ()
val sin = _prim "Real32_Math_sin": real -> real;
val sinh = _import "Real32_Math_sinh" private: real -> real;
val sqrt = _prim "Real32_Math_sqrt": real -> real;
val tan = _prim "Real32_Math_tan": real -> real;
val tanh = _import "Real32_Math_tanh" private: real -> real;
end

val * = _prim "Real32_mul": real * real -> real;
Expand All @@ -145,15 +143,8 @@ structure Real32 : PRIM_REAL =
val == = _prim "Real32_equal": real * real -> bool;
val ?= = _prim "Real32_qequal": real * real -> bool;
val abs = _prim "Real32_abs": real -> real;
val frexp = _import "Real32_frexp" private: real * C_Int.t ref -> real;
val gdtoa = _import "Real32_gdtoa" private: real * C_Int.t * C_Int.t * C_Int.t * C_Int.t ref -> C_String.t;
val ldexp = _prim "Real32_ldexp": real * C_Int.t -> real;
val modf = _import "Real32_modf" private: real * real ref -> real;
val round = _prim "Real32_round": real -> real;
val realCeil = _import "Real32_realCeil" private: real -> real;
val realFloor = _import "Real32_realFloor" private: real -> real;
val realTrunc = _import "Real32_realTrunc" private: real -> real;
val strtor = _import "Real32_strtor" private: NullString8.t * C_Int.t -> real;

val fromInt8Unsafe = _prim "WordS8_rndToReal32": Int8.int -> real;
val fromInt16Unsafe = _prim "WordS16_rndToReal32": Int16.int -> real;
Expand Down Expand Up @@ -193,6 +184,7 @@ structure Real32 =

structure Real64 : PRIM_REAL =
struct
open PrimitiveFFI.Real64
open Real64

val realSize : Int32.int = 64
Expand All @@ -203,24 +195,21 @@ structure Real64 : PRIM_REAL =
structure Math =
struct
type real = real
open Math

val acos = _prim "Real64_Math_acos": real -> real;
val asin = _prim "Real64_Math_asin": real -> real;
val atan = _prim "Real64_Math_atan": real -> real;
val atan2 = _prim "Real64_Math_atan2": real * real -> real;
val cos = _prim "Real64_Math_cos": real -> real;
val cosh = _import "Real64_Math_cosh" private: real -> real;
val e = #1 _symbol "Real64_Math_e" private: real GetSet.t; ()
val e = eGet ()
val exp = _prim "Real64_Math_exp": real -> real;
val ln = _prim "Real64_Math_ln": real -> real;
val log10 = _prim "Real64_Math_log10": real -> real;
val pi = #1 _symbol "Real64_Math_pi" private: real GetSet.t; ()
val pow = _import "Real64_Math_pow" private: real * real -> real;
val pi = piGet ()
val sin = _prim "Real64_Math_sin": real -> real;
val sinh = _import "Real64_Math_sinh" private: real -> real;
val sqrt = _prim "Real64_Math_sqrt": real -> real;
val tan = _prim "Real64_Math_tan": real -> real;
val tanh = _import "Real64_Math_tanh" private: real -> real;
end

val * = _prim "Real64_mul": real * real -> real;
Expand All @@ -235,15 +224,8 @@ structure Real64 : PRIM_REAL =
val == = _prim "Real64_equal": real * real -> bool;
val ?= = _prim "Real64_qequal": real * real -> bool;
val abs = _prim "Real64_abs": real -> real;
val frexp = _import "Real64_frexp" private: real * C_Int.t ref -> real;
val gdtoa = _import "Real64_gdtoa" private: real * C_Int.t * C_Int.t * C_Int.t * C_Int.t ref -> C_String.t;
val ldexp = _prim "Real64_ldexp": real * C_Int.t -> real;
val modf = _import "Real64_modf" private: real * real ref -> real;
val round = _prim "Real64_round": real -> real;
val realCeil = _import "Real64_realCeil" private: real -> real;
val realFloor = _import "Real64_realFloor" private: real -> real;
val realTrunc = _import "Real64_realTrunc" private: real -> real;
val strtor = _import "Real64_strtor" private: NullString8.t * C_Int.t -> real;

val fromInt8Unsafe = _prim "WordS8_rndToReal64": Int8.int -> real;
val fromInt16Unsafe = _prim "WordS16_rndToReal64": Int16.int -> real;
Expand Down
8 changes: 8 additions & 0 deletions basis-library/primitive/prim1.sml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ structure Exn =
exception Span
exception Subscript

(* Fusing of adjacent `Word<N>_<op>` and `Word{S,U}<N>_<op>CheckP` primitives
* by the codegens may depend on the relative order of `!a` and `?a`;
* see:
* - /mlton/codegen/amd64-codegen/amd64-simplify.fun:elimALRedundant
* - /mlton/codegen/c-codegen/c-codegen.fun:outputStatementsFuseOpAndChk
* - /mlton/codegen/llvm-codegen/llvm-codegen.fun:outputStatementsFuseOpAndChk
* - /mlton/codegen/x86-codegen/x86-simplify.fun:elimALRedundant
*)
val mkOverflow: ('a -> 'b) * ('a -> bool) -> ('a -> 'b) =
fn (!, ?) => fn a =>
let val r = ! a
Expand Down
9 changes: 5 additions & 4 deletions basis-library/primitive/primitive.mlb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(* Copyright (C) 2016-2017 Matthew Fluet.
(* Copyright (C) 2016-2017,2019 Matthew Fluet.
* Copyright (C) 2004-2007 Henry Cejtin, Matthew Fluet, Suresh
* Jagannathan, and Stephen Weeks.
*
Expand Down Expand Up @@ -59,6 +59,10 @@ in
prim-seq.sml
prim-nullstring.sml

prim-mlton.sml

basis-ffi.sml

prim-int-inf.sml

prim-char.sml
Expand All @@ -69,9 +73,6 @@ in
prim-pack-word.sml
prim-pack-real.sml

prim-mlton.sml

basis-ffi.sml
prim2.sml

(* Check compatibility between primitives and runtime functions. *)
Expand Down
13 changes: 2 additions & 11 deletions basis-library/real/real.sml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(* Copyright (C) 2011-2014,2017 Matthew Fluet.
(* Copyright (C) 2011-2014,2017,2019 Matthew Fluet.
* Copyright (C) 2003-2007 Henry Cejtin, Matthew Fluet, Suresh
* Jagannathan, and Stephen Weeks.
*
Expand Down Expand Up @@ -179,16 +179,7 @@ functor Real (structure W: WORD_EXTRA

fun isNormal r = class r = NORMAL

val op ?= =
if MLton.Codegen.isAMD64 orelse MLton.Codegen.isLLVM orelse MLton.Codegen.isX86
then R.?=
else
fn (x, y) =>
case (class x, class y) of
(NAN, _) => true
| (_, NAN) => true
| (ZERO, ZERO) => true
| _ => R.== (x, y)
val op ?= = R.?=

fun min (x, y) =
if x <= y then x
Expand Down
26 changes: 13 additions & 13 deletions include/amd64-main.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ PRIVATE GC_state MLton_gcState() {
}

static GC_frameIndex returnAddressToFrameIndex (GC_returnAddress ra) {
return *((GC_frameIndex*)(ra - sizeof(GC_frameIndex)));
return *((GC_frameIndex*)(ra - sizeof(GC_frameIndex)));
}

static inline pointer getJumpFromStackTop (GC_state s) {
return *(pointer*)(s->stackTop - GC_RETURNADDRESS_SIZE);
}

PRIVATE void MLton_jumpToSML (pointer jump);
Expand All @@ -49,7 +53,6 @@ PRIVATE void MLton_jumpToSML (pointer jump);
static void MLton_callFromC (CPointer localOpArgsResPtr) { \
pointer jump; \
GC_state s = MLton_gcState(); \
\
if (DEBUG_AMD64CODEGEN) \
fprintf (stderr, "MLton_callFromC() starting\n"); \
s->callFromCOpArgsResPtr = localOpArgsResPtr; \
Expand All @@ -59,8 +62,8 @@ static void MLton_callFromC (CPointer localOpArgsResPtr) { \
s->limit = s->limitPlusSlop - GC_HEAP_LIMIT_SLOP; \
/* Return to the C Handler thread. */ \
GC_switchToThread (s, GC_getCallFromCHandlerThread (s), 0); \
jump = *(pointer*)(s->stackTop - GC_RETURNADDRESS_SIZE); \
MLton_jumpToSML(jump); \
jump = getJumpFromStackTop (s); \
MLton_jumpToSML (jump); \
s->atomicState += 1; \
GC_switchToThread (s, GC_getSavedThread (s), 0); \
s->atomicState -= 1; \
Expand All @@ -73,43 +76,40 @@ static void MLton_callFromC (CPointer localOpArgsResPtr) { \

#define MLtonMain(al, mg, mfs, mmc, pk, ps, ml) \
PUBLIC int MLton_main (int argc, char* argv[]) { \
pointer jump; \
extern unsigned char ml; \
pointer jump; \
GC_state s = MLton_gcState(); \
\
Initialize (s, al, mg, mfs, mmc, pk, ps); \
if (s->amOriginal) { \
real_Init(); \
static_Init(); \
jump = (pointer)&ml; \
} else { \
jump = *(pointer*)(s->stackTop - GC_RETURNADDRESS_SIZE); \
jump = getJumpFromStackTop (s); \
} \
MLton_jumpToSML(jump); \
MLton_jumpToSML (jump); \
return 1; \
}

#define MLtonLibrary(al, mg, mfs, mmc, pk, ps, ml) \
PUBLIC void LIB_OPEN(LIBNAME) (int argc, char* argv[]) { \
extern unsigned char ml; \
pointer jump; \
GC_state s = MLton_gcState(); \
extern unsigned char ml; \
\
Initialize (s, al, mg, mfs, mmc, pk, ps); \
if (s->amOriginal) { \
real_Init(); \
static_Init(); \
jump = (pointer)&ml; \
} else { \
jump = *(pointer*)(s->stackTop - GC_RETURNADDRESS_SIZE); \
jump = getJumpFromStackTop (s); \
} \
MLton_jumpToSML(jump); \
} \
PUBLIC void LIB_CLOSE(LIBNAME) () { \
pointer jump; \
GC_state s = MLton_gcState(); \
\
jump = *(pointer*)(s->stackTop - GC_RETURNADDRESS_SIZE); \
jump = getJumpFromStackTop (s); \
MLton_jumpToSML(jump); \
GC_done(s); \
}
Expand Down
Loading

0 comments on commit 7ab49d0

Please sign in to comment.