revise overload resolution for splats/truncations

Greg Roth · Greg Roth · commit c2f85a695659 · 2025-02-12T22:31:39.000-07:00
Allow truncations when matching arguments for intrinsic overloads. This eliminates the need for explicit scalar extractions from vectors for arguments that are scalar by nature. This applies to any vector passed where a scalar is expected: the truncation is allowed, but a warning is emitted, the same as is done for other assignments of vectors to scalars. Splats remain the preferred transformation, and perfect matches are promoted to take precedence over them. This removes the need to carefully order intrinsics to ensure that the right variant gets matched first before another one incorrectly takes its place with a faulty cast. Allowing truncations causes problems with a small subset of intrinsics that have explicit overloads for various matrix, vector, and scalar combinations, namely the mul overloads. These could be simplified to accept a new range of template types, except that the dimensions need to be matched in unconventional ways. For these, the notion of uncastable or "ONLY" variants of the template/layout types is introduced. These are indicated with a trailing "!" after the parameter typename in gen_intrin_main, which directs them to an array that contains a NOCAST enum that, when encountered, will skip the attempts to splat or truncate. Fixes microsoft#7079
diff --git a/include/dxc/dxcapi.internal.h b/include/dxc/dxcapi.internal.h
@@ -46,8 +46,11 @@ enum LEGAL_INTRINSIC_TEMPLATES {
       4, // Any one of scalar, vector or matrix types (but not object).
   LITEMPLATE_OBJECT = 5, // Object types.
   LITEMPLATE_ARRAY = 6,  // Scalar array.
+  LITEMPLATE_SCALAR_ONLY = 7, // Uncastable scalar types.
+  LITEMPLATE_VECTOR_ONLY = 8, // Uncastable vector types (eg. float3).
+  LITEMPLATE_MATRIX_ONLY = 9, // Uncastable matrix types (eg. float3x3).
 
-  LITEMPLATE_COUNT = 7
+  LITEMPLATE_COUNT = 10
 };
 
 // INTRIN_COMPTYPE_FROM_TYPE_ELT0 is for object method intrinsics to indicate
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -4315,12 +4315,10 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
   loadArgs.emplace_back(opArg);         // opcode
   loadArgs.emplace_back(helper.handle); // resource handle
 
+  // offsets
   if (opcode == OP::OpCode::TextureLoad) {
     // set mip level
     loadArgs.emplace_back(helper.mipLevel);
-  }
-
-  if (opcode == OP::OpCode::TextureLoad) {
     // texture coord
     unsigned coordSize = DxilResource::GetNumCoords(RK);
     bool isVectorAddr = helper.addr->getType()->isVectorTy();
@@ -4332,22 +4330,6 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
       } else
         loadArgs.emplace_back(undefI);
     }
-  } else {
-    if (helper.addr->getType()->isVectorTy()) {
-      Value *scalarOffset =
-          Builder.CreateExtractElement(helper.addr, (uint64_t)0);
-
-      // TODO: calculate the real address based on opcode
-
-      loadArgs.emplace_back(scalarOffset); // offset
-    } else {
-      // TODO: calculate the real address based on opcode
-
-      loadArgs.emplace_back(helper.addr); // offset
-    }
-  }
-  // offset 0
-  if (opcode == OP::OpCode::TextureLoad) {
     if (helper.offset && !isa<llvm::UndefValue>(helper.offset)) {
       unsigned offsetSize = DxilResource::GetNumOffsets(RK);
       for (unsigned i = 0; i < 3; i++) {
@@ -4361,11 +4343,9 @@ void TranslateLoad(ResLoadHelper &helper, HLResource::Kind RK,
       loadArgs.emplace_back(undefI);
       loadArgs.emplace_back(undefI);
     }
-  }
-
-  // Offset 1
-  if (RK == DxilResource::Kind::TypedBuffer) {
-    loadArgs.emplace_back(undefI);
+  } else {
+    loadArgs.emplace_back(helper.addr); // c0
+    loadArgs.emplace_back(undefI); // c1
   }
 
   Value *ResRet = Builder.CreateCall(F, loadArgs, OP->GetOpCodeName(opcode));
@@ -4539,12 +4519,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   if (RK == DxilResource::Kind::RawBuffer ||
       RK == DxilResource::Kind::TypedBuffer) {
     // Offset 0
-    if (offset->getType()->isVectorTy()) {
-      Value *scalarOffset = Builder.CreateExtractElement(offset, (uint64_t)0);
-      storeArgs.emplace_back(scalarOffset); // offset
-    } else {
-      storeArgs.emplace_back(offset); // offset
-    }
+    storeArgs.emplace_back(offset); // offset
 
     // Store offset0 for later use
     offset0Idx = storeArgs.size() - 1;
@@ -6278,7 +6253,7 @@ Value *StreamOutputLower(CallInst *CI, IntrinsicOp IOP, DXIL::OpCode opcode,
                          HLOperationLowerHelper &helper,
                          HLObjectOperationLowerHelper *pObjHelper,
                          bool &Translated) {
-  // Translated in DxilGenerationPass::GenerateStreamOutputOperation.
+  // Translated in HLSignatureLower::GenerateStreamOutputOperation.
   // Do nothing here.
   // Mark not translated.
   Translated = false;
@@ -7983,7 +7958,7 @@ static Value *ExtractFromTypedBufferLoad(const ResRetValueArray &ResRet,
     DXASSERT_NOMSG(FirstElemIdx <= ResRet.size() - ElemCount);
     for (unsigned ElemIdx = 0; ElemIdx < ElemCount; ++ElemIdx) {
       Elems.emplace_back(
-          ResRet[std::min<size_t>(FirstElemIdx + ElemIdx, ResRet.size() - 1)]);
+                         ResRet[std::min<size_t>(FirstElemIdx + ElemIdx, ResRet.size() - 1)]);// there is no way this is right. why add the offset here?
     }
   } else {
     Value *ArrayAlloca = SpillValuesToArrayAlloca(
@@ -8368,15 +8343,6 @@ void TranslateStructBufSubscriptUser(Instruction *user, Value *handle,
     if (group == HLOpcodeGroup::HLIntrinsic) {
       IntrinsicOp IOP = static_cast<IntrinsicOp>(opcode);
       switch (IOP) {
-      case IntrinsicOp::MOP_Load: {
-        if (userCall->getType()->isPointerTy()) {
-          // Struct will return pointers which like []
-
-        } else {
-          // Use builtin types on structuredBuffer.
-        }
-        DXASSERT(0, "not implement yet");
-      } break;
       case IntrinsicOp::IOP_InterlockedAdd: {
         AtomicHelper helper(userCall, DXIL::OpCode::AtomicBinOp, handle, bufIdx,
                             baseOffset);
diff --git a/lib/HLSL/HLSignatureLower.cpp b/lib/HLSL/HLSignatureLower.cpp
@@ -1653,8 +1653,7 @@ void HLSignatureLower::GenerateStreamOutputOperation(Value *streamVal,
     if (group == HLOpcodeGroup::NotHL)
       continue;
     unsigned opcode = GetHLOpcode(CI);
-    DXASSERT_LOCALVAR(group, group == HLOpcodeGroup::HLIntrinsic,
-                      "Must be HLIntrinsic here");
+    DXASSERT(group == HLOpcodeGroup::HLIntrinsic, "Must be HLIntrinsic here");
     IntrinsicOp IOP = static_cast<IntrinsicOp>(opcode);
     switch (IOP) {
     case IntrinsicOp::MOP_Append:
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -662,6 +662,7 @@ enum ArTypeObjectKind {
                      // indexer object used to implement .mips[1].
   AR_TOBJ_STRING,    // Represents a string
   AR_TOBJ_DEPENDENT, // Dependent type for template.
+  AR_TOBJ_NOCAST,    // Parameter should not have layout casts (splat,trunc)
 };
 
 enum TYPE_CONVERSION_FLAGS {
@@ -989,9 +990,15 @@ static const ArTypeObjectKind g_NullTT[] = {AR_TOBJ_VOID, AR_TOBJ_UNKNOWN};
 
 static const ArTypeObjectKind g_ArrayTT[] = {AR_TOBJ_ARRAY, AR_TOBJ_UNKNOWN};
 
+static const ArTypeObjectKind g_ScalarOnlyTT[] = {AR_TOBJ_SCALAR, AR_TOBJ_NOCAST, AR_TOBJ_UNKNOWN};
+
+static const ArTypeObjectKind g_VectorOnlyTT[] = {AR_TOBJ_VECTOR, AR_TOBJ_NOCAST, AR_TOBJ_UNKNOWN};
+
+static const ArTypeObjectKind g_MatrixOnlyTT[] = {AR_TOBJ_MATRIX, AR_TOBJ_NOCAST, AR_TOBJ_UNKNOWN};
+
 const ArTypeObjectKind *g_LegalIntrinsicTemplates[] = {
     g_NullTT, g_ScalarTT, g_VectorTT, g_MatrixTT,
-    g_AnyTT,  g_ObjectTT, g_ArrayTT,
+    g_AnyTT,  g_ObjectTT, g_ArrayTT, g_ScalarOnlyTT, g_VectorOnlyTT, g_MatrixOnlyTT,
 };
 C_ASSERT(ARRAYSIZE(g_LegalIntrinsicTemplates) == LITEMPLATE_COUNT);
 
@@ -6109,7 +6116,7 @@ bool HLSLExternalSource::MatchArguments(
   ArBasicKind
       ComponentType[MaxIntrinsicArgs]; // Component type for each argument,
                                        // AR_BASIC_UNKNOWN if unspecified.
-  UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UNUSED_INDEX32
+  UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UnusedSize
                                        // if unspecified.
   badArgIdx = MaxIntrinsicArgs;
 
@@ -6245,19 +6252,23 @@ bool HLSLExternalSource::MatchArguments(
         "otherwise intrinsic table was modified and g_MaxIntrinsicParamCount "
         "was not updated (or uTemplateId is out of bounds)");
 
-    // Compare template
+    // Compare template to any type matching params requirements.
     if ((AR_TOBJ_UNKNOWN == Template[pIntrinsicArg->uTemplateId]) ||
         ((AR_TOBJ_SCALAR == Template[pIntrinsicArg->uTemplateId]) &&
          (AR_TOBJ_VECTOR == TypeInfoShapeKind ||
           AR_TOBJ_MATRIX == TypeInfoShapeKind))) {
-      // Unrestricted or truncation of tuples to scalars are allowed
+      // Previous params imposed no type restrictions,
+      // or truncation of tuples to scalars is allowed.
+      // NOTE: it is unclear whether this ever results in truncations;
+      // later steps harmonize common typed params and will always convert the earlier arg into a splat instead.
       Template[pIntrinsicArg->uTemplateId] = TypeInfoShapeKind;
     } else if (AR_TOBJ_SCALAR == TypeInfoShapeKind) {
       if (AR_TOBJ_SCALAR != Template[pIntrinsicArg->uTemplateId] &&
           AR_TOBJ_VECTOR != Template[pIntrinsicArg->uTemplateId] &&
           AR_TOBJ_MATRIX != Template[pIntrinsicArg->uTemplateId]) {
         // Scalars to tuples can be splatted, scalar to anything else is not
         // allowed
+        // Dead code. The only params with requirements are already one of these three.
         badArgIdx = std::min(badArgIdx, iArg);
       }
     } else {
@@ -6288,6 +6299,11 @@ bool HLSLExternalSource::MatchArguments(
       }
     }
 
+    // If the intrinsic parameter has variable rows or columns but must match
+    // other argument dimensions, it will be specified in pIntrinsicArg with
+    // a special value indicating that the dimension depends on the passed values.
+    // uSpecialSize stores the dimensions of the actual passed type.
+
     // Rows
     if (AR_TOBJ_SCALAR != TypeInfoShapeKind) {
       if (pIntrinsicArg->uRows >= IA_SPECIAL_BASE) {
@@ -6394,18 +6410,37 @@ bool HLSLExternalSource::MatchArguments(
     const ArTypeObjectKind *pTT =
         g_LegalIntrinsicTemplates[pArgument->uLegalTemplates];
     if (AR_TOBJ_UNKNOWN != Template[i]) {
-      if ((AR_TOBJ_SCALAR == Template[i]) &&
-          (AR_TOBJ_VECTOR == *pTT || AR_TOBJ_MATRIX == *pTT)) {
-        Template[i] = *pTT;
-      } else {
+      // See if a perfect match overload is available
+      while (AR_TOBJ_UNKNOWN != *pTT && AR_TOBJ_NOCAST != *pTT) {
+        if (Template[i] == *pTT)
+          break;
+        pTT++;
+      }
+
+      if (AR_TOBJ_UNKNOWN == *pTT) {
+        // Perfect match failed and casts are allowed.
+        // Try splats and truncations to get a match.
+        pTT = g_LegalIntrinsicTemplates[pArgument->uLegalTemplates];
         while (AR_TOBJ_UNKNOWN != *pTT) {
-          if (Template[i] == *pTT)
+          if (AR_TOBJ_SCALAR == Template[i] &&
+              (AR_TOBJ_VECTOR == *pTT || AR_TOBJ_MATRIX == *pTT)) {
+            // If a scalar was passed in and the expected value was matrix/vector
+            // convert to the template type for a splat.
+            // Only applicable to VectorTT and MatrixTT, since the vec/mtx has to be first in the list.
+            Template[i] = *pTT;
             break;
+          } else if (AR_TOBJ_VECTOR == Template[i] && AR_TOBJ_SCALAR == *pTT) {
+            // If a vector was passed in and the expected value was scalar
+            // convert to the template type for a truncation.
+            // Only applicable to ScalarTT, since the scalar has to be first in the list.
+            Template[i] = AR_TOBJ_SCALAR;
+            break;
+          }
           pTT++;
         }
       }
 
-      if (AR_TOBJ_UNKNOWN == *pTT) {
+      if (AR_TOBJ_UNKNOWN == *pTT || AR_TOBJ_NOCAST == *pTT) {
         Template[i] = g_LegalIntrinsicTemplates[pArgument->uLegalTemplates][0];
         badArgIdx = std::min(badArgIdx, i);
       }
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/intrinsics/compound/refract.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/intrinsics/compound/refract.hlsl
@@ -13,14 +13,15 @@
 // RUN: %dxc -E main -T vs_6_2 -DTY1=float3 -DTY2=bool -enable-16bit-types %s | FileCheck %s
 // RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=uint16_t -enable-16bit-types %s | FileCheck %s
 
+// RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=uint16_t4 -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK
+// RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=float16_t2 -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK
+
 // RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=uint16_t4x4 -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK_ERROR
-// RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=uint16_t4 -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK_ERROR
-// RUN: %dxc -E main -T vs_6_2 -DTY1=float4 -DTY2=float16_t2 -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK_ERROR
 // RUN: %dxc -E main -T vs_6_2 -DTY1=uint16_t4x4 -DTY2=float16_t -enable-16bit-types %s | FileCheck %s -check-prefix=CHECK_ERROR
 
 // CHECK: define void @main()
 // CHECK_ERROR: note: candidate function not viable: no known conversion from
 
 TY1 main (TY1 a: IN0, TY1 b : IN1, TY2 c : IN2) : OUT {
    return refract(a, b, c);
-}
+}
diff --git a/tools/clang/test/SemaHLSL/intrinsic-examples.hlsl b/tools/clang/test/SemaHLSL/intrinsic-examples.hlsl
@@ -18,10 +18,10 @@ float4 RWByteAddressBufferMain(uint2 a : A, uint2 b : B) : SV_Target
   uint status;
   // TODO - fix the following error - the subscript exist, but the indexer type is incorrect - message is misleading
   r += uav1[b]; // expected-error {{type 'RWByteAddressBuffer' does not provide a subscript operator}} fxc-error {{X3121: array, matrix, vector, or indexable object type expected in index expression}}
-  r += uav1.Load(a); // expected-error {{no matching member function for call to 'Load'}} expected-note {{candidate function template not viable: requires 2 arguments, but 1 was provided}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 1 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
-  uav1.Load(a, status); // expected-error {{no matching member function for call to 'Load'}} expected-note {{candidate function template not viable: requires single argument 'byteOffset', but 2 arguments were provided}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 2 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
+  r += uav1.Load(a); // expected-warning {{implicit truncation of vector type}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 1 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
+  uav1.Load(a, status); // expected-warning {{implicit truncation of vector type}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 2 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
   r += status;
-  uav1.Load(a, status); // expected-error {{no matching member function for call to 'Load'}} expected-note {{requires single argument 'byteOffset', but 2 arguments were provided}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 2 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
+  uav1.Load(a, status); // expected-warning {{implicit truncation of vector type}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint)}} fxc-error {{X3013:     RWByteAddressBuffer<uint>.Load(uint, out uint status)}} fxc-error {{X3013: 'Load': no matching 2 parameter intrinsic method}} fxc-error {{X3013: Possible intrinsic methods are:}}
   r += status;
   uav1[b] = r; // expected-error {{type 'RWByteAddressBuffer' does not provide a subscript operator}} fxc-error {{X3121: array, matrix, vector, or indexable object type expected in index expression}}
   uav1.Load(a.x, status);
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
@@ -198,15 +198,15 @@ $type1 [[rn,unsigned_op=umax]] max(in numeric<> a, in $type1 b);
 $type1 [[rn,unsigned_op=umin]] min(in numeric<> a, in $type1 b);
 $type1 [[]] modf(in float_like<> x, out $type1 ip);
 uint<4> [[rn]] msad4(in uint reference, in uint<2> source, in uint<4> accum);
-numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric b) : mul_ss;
-numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<c2> b) : mul_sv;
-numeric<r2, c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric a, in $match<2, 0> numeric<r2, c2> b) : mul_sm;
-numeric<c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric b) : mul_vs;
-numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in $match<2, 0> numeric<c> b) : mul_vv;
-numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_vm;
-numeric<r, c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric b) : mul_ms;
-numeric<r> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in $match<2, 0> numeric<c> b) : mul_mv;
-numeric<r, c2> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c> a, in col_major $match<2, 0> numeric<c, c2> b) : mul_mm;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric! a, in $match<2, 0> numeric! b) : mul_ss;
+numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric! a, in $match<2, 0> numeric<c2>! b) : mul_sv;
+numeric<r2, c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric! a, in $match<2, 0> numeric<r2, c2>! b) : mul_sm;
+numeric<c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c>! a, in $match<2, 0> numeric! b) : mul_vs;
+numeric [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c>! a, in $match<2, 0> numeric<c>! b) : mul_vv;
+numeric<c2> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<c>! a, in col_major $match<2, 0> numeric<c, c2>! b) : mul_vm;
+numeric<r, c> [[rn,unsigned_op=umul]] mul(in $match<1, 0> numeric<r, c>! a, in $match<2, 0> numeric! b) : mul_ms;
+numeric<r> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c>! a, in $match<2, 0> numeric<c>! b) : mul_mv;
+numeric<r, c2> [[rn,unsigned_op=umul]] mul(in row_major $match<1, 0> numeric<r, c>! a, in col_major $match<2, 0> numeric<c, c2>! b) : mul_mm;
 $type1 [[rn]] normalize(in float_like<c> x);
 $type1 [[rn]] pow(in float_like<> x, in $type1 y);
 void [[]] printf(in string Format, ...);
@@ -849,8 +849,8 @@ $match<0, -1> void<4> [[]] GatherCmpAlpha(in sampler_cmp s, in float<4> x, in fl
 namespace BufferMethods {
 
 void [[]] GetDimensions(out uint_only width) : bufinfo;
-$classT [[ro]] Load(in int<1> x) : buffer_load;
-$classT [[]] Load(in int<1> x, out uint_only status) : buffer_load_s;
+$classT [[ro]] Load(in int x) : buffer_load;
+$classT [[]] Load(in int x, out uint_only status) : buffer_load_s;
 
 } namespace
 
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py