EnzymeAD
diff --git a/‎src/mlir/Dialects/Arith.jl
Lines changed: 142 additions & 6 deletions b/‎src/mlir/Dialects/Arith.jl
Lines changed: 142 additions & 6 deletions
diff --git a/‎src/mlir/Dialects/Builtin.jl
Lines changed: 1 addition & 1 deletion b/‎src/mlir/Dialects/Builtin.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/mlir/Dialects/Gpu.jl
Lines changed: 8 additions & 6 deletions b/‎src/mlir/Dialects/Gpu.jl
Lines changed: 8 additions & 6 deletions
diff --git a/‎src/mlir/Dialects/Llvm.jl
Lines changed: 6 additions & 3 deletions b/‎src/mlir/Dialects/Llvm.jl
Lines changed: 6 additions & 3 deletions
diff --git a/‎src/mlir/Dialects/MemRef.jl
Lines changed: 1 addition & 1 deletion b/‎src/mlir/Dialects/MemRef.jl
Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ The `addi` operation takes two operands and returns one result, each of
 these is required to be the same type. This type may be an integer scalar type, 
 a vector whose element type is integer, or a tensor of integers.
 
-This op supports `nuw`/`nsw` overflow flags which stands stand for
+This op supports `nuw`/`nsw` overflow flags which stand for
 \"No Unsigned Wrap\" and \"No Signed Wrap\", respectively. If the `nuw` and/or
 `nsw` flags are present, and an unsigned/signed overflow occurs
 (respectively), the result is poison.
@@ -1193,7 +1193,7 @@ The `muli` operation takes two operands and returns one result, each of
 these is required to be the same type. This type may be an integer scalar type,
 a vector whose element type is integer, or a tensor of integers.
 
-This op supports `nuw`/`nsw` overflow flags which stands stand for
+This op supports `nuw`/`nsw` overflow flags which stand for
 \"No Unsigned Wrap\" and \"No Signed Wrap\", respectively. If the `nuw` and/or
 `nsw` flags are present, and an unsigned/signed overflow occurs
 (respectively), the result is poison.
@@ -1578,6 +1578,129 @@ function sitofp(in::Value; out::IR.Type, location=Location())
     )
 end
 
+"""
+`scaling_extf`
+
+This operation upcasts input floating-point values using provided scale 
+values. It expects both scales and the input operand to be of the same shape, 
+making the operation elementwise. Scales are usually calculated per block 
+following the OCP MXFP spec as described in https://arxiv.org/abs/2310.10537.
+
+If scales are calculated per block where blockSize != 1, then scales may 
+require broadcasting to make this operation elementwise. For example, let\'s 
+say the input is of shape `<dim1 x dim2 x ... dimN>`. Given blockSize != 1 and 
+assuming quantization happens on the last axis, the input can be reshaped to 
+`<dim1 x dim2 x ... (dimN/blockSize) x blockSize>`. Scales will be calculated 
+per block on the last axis. Therefore, scales will be of shape 
+`<dim1 x dim2 x ... (dimN/blockSize) x 1>`. Scales could also be of some other 
+shape as long as it is broadcast compatible with the input, e.g., 
+`<1 x 1 x ... (dimN/blockSize) x 1>`.
+
+In this example, before calling into `arith.scaling_extf`, scales must be 
+broadcasted to `<dim1 x dim2 x dim3 ... (dimN/blockSize) x blockSize>`. Note 
+that there could be multiple quantization axes. Internally, 
+`arith.scaling_extf` would perform the following:
+ 
+  ```
+  resultTy = get_type(result) 
+  scaleTy  = get_type(scale)
+  inputTy = get_type(input)
+  scale.exponent = arith.truncf(scale) : scaleTy to f8E8M0
+  scale.extf = arith.extf(scale.exponent) : f8E8M0 to resultTy
+  input.extf = arith.extf(input) : inputTy to resultTy
+  result = arith.mulf(scale.extf, input.extf)
+  ```
+  It propagates NaN values. Therefore, if either scale or the input element 
+  contains NaN, then the output element value will also be a NaN.
+"""
+function scaling_extf(
+    in::Value, scale::Value; out::IR.Type, fastmath=nothing, location=Location()
+)
+    op_ty_results = IR.Type[out,] # the single result type is supplied by the caller
+    operands = Value[in, scale] # operand order: input first, then scale
+    owned_regions = Region[] # no regions are attached
+    successors = Block[] # no successor blocks are attached
+    attributes = NamedAttribute[] # keyword attributes are appended only when given
+    !isnothing(fastmath) && push!(attributes, namedattribute("fastmath", fastmath))
+
+    return create_operation(
+        "arith.scaling_extf",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false, # result types are passed explicitly via `results`
+    )
+end
+
+"""
+`scaling_truncf`
+
+This operation downcasts input using the provided scale values. It expects 
+both scales and the input operand to be of the same shape and, therefore, 
+makes the operation elementwise. Scales are usually calculated per block 
+following the OCP MXFP spec as described in https://arxiv.org/abs/2310.10537.
+Users are required to normalize and clamp the scales as necessary before
+passing them to this operation. The OCP MXFP spec also flushes denorms
+on the input operand, which should be handled during lowering by passing the
+appropriate fastMath flag to this operation.
+
+If scales are calculated per block where blockSize != 1, scales may require 
+broadcasting to make this operation elementwise. For example, let\'s say the 
+input is of shape `<dim1 x dim2 x ... dimN>`. Given blockSize != 1 and 
+assuming quantization happens on the last axis, the input can be reshaped to 
+`<dim1 x dim2 x ... (dimN/blockSize) x blockSize>`. Scales will be calculated 
+per block on the last axis. Therefore, scales will be of shape 
+`<dim1 x dim2 x ... (dimN/blockSize) x 1>`. Scales could also be of some other 
+shape as long as it is broadcast compatible with the input, e.g., 
+`<1 x 1 x ... (dimN/blockSize) x 1>`.
+
+In this example, before calling into `arith.scaling_truncf`, scales must be 
+broadcasted to `<dim1 x dim2 x dim3 ... (dimN/blockSize) x blockSize>`. Note 
+that there could be multiple quantization axes. Internally, 
+`arith.scaling_truncf` would perform the following:
+
+```
+scaleTy = get_type(scale)
+inputTy = get_type(input)
+resultTy = get_type(result)
+scale.exponent = arith.truncf(scale) : scaleTy to f8E8M0
+scale.extf = arith.extf(scale.exponent) : f8E8M0 to inputTy
+result = arith.divf(input, scale.extf)
+result.cast = arith.truncf(result, resultTy)
+```
+"""
+function scaling_truncf(
+    in::Value,
+    scale::Value;
+    out::IR.Type,
+    roundingmode=nothing,
+    fastmath=nothing,
+    location=Location(),
+)
+    op_ty_results = IR.Type[out,] # the single result type is supplied by the caller
+    operands = Value[in, scale] # operand order: input first, then scale
+    owned_regions = Region[] # no regions are attached
+    successors = Block[] # no successor blocks are attached
+    attributes = NamedAttribute[] # keyword attributes are appended only when given
+    !isnothing(roundingmode) &&
+        push!(attributes, namedattribute("roundingmode", roundingmode))
+    !isnothing(fastmath) && push!(attributes, namedattribute("fastmath", fastmath))
+
+    return create_operation(
+        "arith.scaling_truncf",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=op_ty_results,
+        result_inference=false, # result types are passed explicitly via `results`
+    )
+end
+
 """
 `shli`
 
@@ -1587,7 +1710,7 @@ unsigned. The low order bits are filled with zeros. If the value of the second
 operand is greater or equal than the bitwidth of the first operand, then the
 operation returns poison.
 
-This op supports `nuw`/`nsw` overflow flags which stands stand for
+This op supports `nuw`/`nsw` overflow flags which stand for
 \"No Unsigned Wrap\" and \"No Signed Wrap\", respectively. If the `nuw` and/or
 `nsw` flags are present, and an unsigned/signed overflow occurs
 (respectively), the result is poison.
@@ -1775,7 +1898,7 @@ The `subi` operation takes two operands and returns one result, each of
 these is required to be the same type. This type may be an integer scalar type,
 a vector whose element type is integer, or a tensor of integers.
 
-This op supports `nuw`/`nsw` overflow flags which stands stand for
+This op supports `nuw`/`nsw` overflow flags which stand for
 \"No Unsigned Wrap\" and \"No Signed Wrap\", respectively. If the `nuw` and/or
 `nsw` flags are present, and an unsigned/signed overflow occurs
 (respectively), the result is poison.
@@ -1865,22 +1988,35 @@ width M and an integer destination type of width N. The destination
 bit-width must be smaller than the input bit-width (N < M).
 The top-most (N - M) bits of the input are discarded.
 
+This op supports `nuw`/`nsw` overflow flags which stand for \"No Unsigned
+Wrap\" and \"No Signed Wrap\", respectively. If the `nuw` keyword is present,
+and any of the truncated bits are non-zero, the result is a poison value.
+If the `nsw` keyword is present, and any of the truncated bits are not the
+same as the top bit of the truncation result, the result is a poison value.
+
 # Example
 
 ```mlir
+  // Scalar truncation.
   %1 = arith.constant 21 : i5     // %1 is 0b10101
   %2 = arith.trunci %1 : i5 to i4 // %2 is 0b0101
   %3 = arith.trunci %1 : i5 to i3 // %3 is 0b101
 
-  %5 = arith.trunci %0 : vector<2 x i32> to vector<2 x i16>
+  // Vector truncation.
+  %4 = arith.trunci %0 : vector<2 x i32> to vector<2 x i16>
+
+  // Scalar truncation with overflow flags.
+  %5 = arith.trunci %a overflow<nsw, nuw> : i32 to i16
 ```
 """
-function trunci(in::Value; out::IR.Type, location=Location())
+function trunci(in::Value; out::IR.Type, overflowFlags=nothing, location=Location())
     op_ty_results = IR.Type[out,]
     operands = Value[in,]
     owned_regions = Region[]
     successors = Block[]
     attributes = NamedAttribute[]
+    !isnothing(overflowFlags) &&
+        push!(attributes, namedattribute("overflowFlags", overflowFlags))
 
     return create_operation(
         "arith.trunci",
 
@@ -20,7 +20,7 @@ A `module` represents a top-level container operation. It contains a single
 [graph region](../LangRef.md#control-flow-and-ssacfg-regions) containing a single block
 which can contain any operations and does not have a terminator. Operations
 within this region cannot implicitly capture values defined outside the module,
-i.e. Modules are [IsolatedFromAbove](../Traits.md#isolatedfromabove). Modules have
+i.e. Modules are [IsolatedFromAbove](../Traits#isolatedfromabove). Modules have
 an optional [symbol name](../SymbolsAndSymbolTables.md) which can be used to refer
 to them in operations.
 
 
@@ -130,7 +130,7 @@ end
 """
 `barrier`
 
-The \"barrier\" op synchronizes all work items of a workgroup. It is used
+The `barrier` op synchronizes all work items of a workgroup. It is used
 to coordinate communication between the work items of the workgroup.
 
 ```mlir
@@ -322,7 +322,7 @@ Returns the block id within the cluster along the x, y, or z `dimension`.
 ```
 
 If `upper_bound` is set, then executing (a lowering of) this operation in an
-environment where the number of thread blocks per cluster  along `dimension`
+environment where the number of thread blocks per cluster along `dimension`
 is greater than `upper_bound` causes undefined behavior.
 
 There is an implicit upper bound of `kMaxClusterDim` (currently 8).
@@ -1905,7 +1905,7 @@ end
 """
 `return_`
 
-A terminator operation for regions that appear in the body of  `gpu.func`
+A terminator operation for regions that appear in the body of `gpu.func`
 functions. The operands to the `gpu.return` are the result values returned
 by an invocation of the `gpu.func`.
 """
@@ -2141,7 +2141,8 @@ trades value with exactly one other lane.
 %3, %4 = gpu.shuffle down %0, %cst1, %width : f32
 ```
 
-For lane `k`, returns the value from lane `(k + 1) % width`.
+For lane `k`, returns the value from lane `(k + cst1)`. If `(k + cst1)` is
+greater than or equal to `width`, the value is poison and `valid` is `false`.
 
 `up` example:
 
@@ -2150,7 +2151,8 @@ For lane `k`, returns the value from lane `(k + 1) % width`.
 %5, %6 = gpu.shuffle up %0, %cst1, %width : f32
 ```
 
-For lane `k`, returns the value from lane `(k - 1) % width`.
+For lane `k`, returns the value from lane `(k - cst1)`. If `(k - cst1)` is
+less than `0`, the value is poison and `valid` is `false`.
 
 `idx` example:
 
@@ -3412,7 +3414,7 @@ end
 """
 `yield`
 
-gpu.yield` is a special terminator operation for blocks inside regions
+`gpu.yield` is a special terminator operation for blocks inside regions
 in gpu ops. It returns values to the immediately enclosing gpu op.
 
 # Example
 
@@ -580,7 +580,6 @@ function call(
     var_callee_type=nothing,
     callee=nothing,
     fastmathFlags=nothing,
-    branch_weights=nothing,
     CConv=nothing,
     TailCallKind=nothing,
     memory_effects=nothing,
@@ -615,8 +614,6 @@ function call(
     !isnothing(callee) && push!(attributes, namedattribute("callee", callee))
     !isnothing(fastmathFlags) &&
         push!(attributes, namedattribute("fastmathFlags", fastmathFlags))
-    !isnothing(branch_weights) &&
-        push!(attributes, namedattribute("branch_weights", branch_weights))
     !isnothing(CConv) && push!(attributes, namedattribute("CConv", CConv))
     !isnothing(TailCallKind) &&
         push!(attributes, namedattribute("TailCallKind", TailCallKind))
@@ -1854,6 +1851,8 @@ function func(;
     frame_pointer=nothing,
     target_cpu=nothing,
     tune_cpu=nothing,
+    reciprocal_estimates=nothing,
+    prefer_vector_width=nothing,
     target_features=nothing,
     unsafe_fp_math=nothing,
     no_infs_fp_math=nothing,
@@ -1927,6 +1926,10 @@ function func(;
         push!(attributes, namedattribute("frame_pointer", frame_pointer))
     !isnothing(target_cpu) && push!(attributes, namedattribute("target_cpu", target_cpu))
     !isnothing(tune_cpu) && push!(attributes, namedattribute("tune_cpu", tune_cpu))
+    !isnothing(reciprocal_estimates) &&
+        push!(attributes, namedattribute("reciprocal_estimates", reciprocal_estimates))
+    !isnothing(prefer_vector_width) &&
+        push!(attributes, namedattribute("prefer_vector_width", prefer_vector_width))
     !isnothing(target_features) &&
         push!(attributes, namedattribute("target_features", target_features))
     !isnothing(unsafe_fp_math) &&
 
@@ -327,7 +327,7 @@ end
 The `alloca` operation allocates memory on the stack, to be automatically
 released when control transfers back from the region of its closest
 surrounding operation with an
-[`AutomaticAllocationScope`](../Traits.md/#automaticallocationscope) trait.
+[`AutomaticAllocationScope`](../Traits/#automaticallocationscope) trait.
 The amount of memory allocated is specified by its memref and additional
 operands. For example: