Skip to content

Commit 8b99b25

Browse files
committed
Add grouping to ArgsortOp
The CUB routine allows using multiple items per thread. The Group parallel type is used to supply multiple elements. Scheduler updates will follow, and other similar ops will be updated in the same way.
1 parent b2253cc commit 8b99b25

20 files changed

+1173
-83
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ list(APPEND NVFUSER_SRCS
193193
${NVFUSER_SRCS_DIR}/debug.cpp
194194
${NVFUSER_SRCS_DIR}/device_lower/analysis/bank_conflict.cpp
195195
${NVFUSER_SRCS_DIR}/device_lower/analysis/circular_buffer.cpp
196+
${NVFUSER_SRCS_DIR}/device_lower/analysis/default_val.cpp
196197
${NVFUSER_SRCS_DIR}/device_lower/analysis/device_version.cpp
197198
${NVFUSER_SRCS_DIR}/device_lower/analysis/divisible_split.cpp
198199
${NVFUSER_SRCS_DIR}/device_lower/analysis/fused_reduction.cpp

csrc/codegen.cpp

Lines changed: 18 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,14 +1426,21 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
14261426
return std::ranges::find(sorted_ids, id) != sorted_ids.end();
14271427
});
14281428

1429-
// At this moment, we only support argsort on thread parallelized
1430-
// dimensions. No serial dimension is allowed either.
14311429
ParallelTypeBitmap sorted_parallel_types;
1430+
IterDomain* grouped_id = nullptr;
14321431
for (auto id : sorted_loop_ids) {
1433-
NVF_ERROR(
1434-
isParallelTypeThreadDim(id->getParallelType()),
1435-
"Argsort on non-thread dimension is not supported");
1436-
sorted_parallel_types.set(id->getParallelType());
1432+
if (isParallelTypeThreadDim(id->getParallelType())) {
1433+
sorted_parallel_types.set(id->getParallelType());
1434+
} else if (id->getParallelType() == ParallelType::Group) {
1435+
NVF_ERROR(
1436+
grouped_id == nullptr,
1437+
"Multiple grouped IDs not supported: ",
1438+
aop->toString());
1439+
grouped_id = id;
1440+
} else {
1441+
NVF_THROW(
1442+
"Invalid parallel type: ", id->toString(), " of ", aop->toString());
1443+
}
14371444
}
14381445

14391446
// TID parallel types must only be used for the sorted IDs with the static
@@ -1469,8 +1476,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
14691476
}
14701477
}
14711478

1472-
// TODO: support ITEMS_PER_THREAD > 1
1473-
constexpr int items_per_thread = 1;
1479+
const int64_t items_per_thread = grouped_id != nullptr
1480+
? grouped_id->extent()->evaluate().as<int64_t>()
1481+
: 1;
14741482

14751483
const auto input = aop->in()->as<kir::TensorIndex>();
14761484

@@ -1479,33 +1487,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
14791487

14801488
// Call the runtime argsort function
14811489
ArgumentBuilder func_args;
1482-
1483-
// The output tensor is assumed to be a register tensor, and thus
1484-
// its storage should always be available without predication
1485-
NVF_ERROR_EQ(
1486-
output->view()->getMemoryType(),
1487-
MemoryType::Local,
1488-
"Argsort output must be a Local tensor: ",
1489-
output->toString());
1490-
func_args.arg("*(int64_t(*)[")
1491-
.append(items_per_thread)
1492-
.append("])")
1493-
.append("(&")
1494-
.append(genInline(output))
1495-
.append(")");
1496-
1497-
NVF_ERROR(aop->predicate() != nullptr && aop->predicate()->hasValue());
1498-
// {pred ? input : (isDescending ? min : max)}
1499-
func_args.arg("{")
1500-
.append(genInline(aop->predicate()))
1501-
.append(" ? ")
1502-
.append(genInline(input))
1503-
.append(" : ")
1504-
.append(
1505-
aop->isDescending() ? getMinimumValue(input->dtype())
1506-
: getMaximumValue(input->dtype()))
1507-
.append("}");
1508-
1490+
func_args.arg(genVariableNameConvertAlignedArray(output));
1491+
func_args.arg(genVariableNameConvertAlignedArray(input));
15091492
func_args.arg(aop->isDescending() ? "true" : "false"); // descending flag
15101493
func_args.arg(genComputeBlockDim());
15111494

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// clang-format off
2+
/*
3+
* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
4+
* All rights reserved.
5+
* SPDX-License-Identifier: BSD-3-Clause
6+
*/
7+
// clang-format on
8+
9+
#include <device_lower/analysis/default_val.h>
10+
#include <fusion.h>
11+
#include <ir/internal_nodes.h>
12+
#include <ir/utils.h>
13+
#include <ops/utils.h>
14+
15+
namespace nvfuser {
16+
17+
// Walk every expression of the fusion once, letting the dispatch
// mechanism route each one to the matching handle() overload.
TensorDefaultVal::TensorDefaultVal(Fusion* fusion) {
  for (auto* expr : fusion->exprs()) {
    dispatch(expr);
  }
}
22+
23+
void TensorDefaultVal::handle(ArgsortOp* aop) {
  // Validation elsewhere guarantees the input is used exclusively by
  // this argsort op, so it is safe to register an initialization
  // value for it on behalf of this op.
  auto* inp_tv = ir_utils::getTvInput(aop);

  // Descending sorts get the type's minimum, ascending sorts the
  // maximum — presumably so padded elements sort to the end; confirm
  // against the codegen's predicate fallback.
  Val* default_val = aop->isDescending()
      ? ops::getMinimumValue(inp_tv->dtype())
      : ops::getMaximumValue(inp_tv->dtype());

  registerDefaultVal(inp_tv, default_val);
}
37+
38+
void TensorDefaultVal::registerDefaultVal(TensorView* tv, Val* val) {
  const auto [it, inserted] = default_val_map_.emplace(tv, val);
  if (inserted) {
    return;
  }
  // A tensor may be registered more than once, but only with an
  // equivalent default value.
  NVF_ERROR(
      it->second->sameAs(val),
      "Duplicate setting of default val for ",
      tv->toString(),
      ". ",
      it->second->toString(),
      " vs ",
      val->toString());
}
47+
48+
// Look up the default value registered for tv; nullptr means none
// was recorded.
Val* TensorDefaultVal::get(TensorView* tv) const {
  const auto it = default_val_map_.find(tv);
  return it == default_val_map_.end() ? nullptr : it->second;
}
56+
57+
} // namespace nvfuser
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// clang-format off
2+
/*
3+
* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
4+
* All rights reserved.
5+
* SPDX-License-Identifier: BSD-3-Clause
6+
*/
7+
// clang-format on
8+
#pragma once
9+
10+
#include <dispatch.h>
11+
12+
#include <unordered_map>
13+
14+
namespace nvfuser {
15+
16+
class Fusion;
17+
class Val;
18+
class TensorView;
19+
20+
class TensorDefaultVal : public OptOutDispatch {
21+
public:
22+
TensorDefaultVal(Fusion* fusion);
23+
24+
Val* get(TensorView* tv) const;
25+
26+
private:
27+
void handle(ArgsortOp* aop) final;
28+
29+
void registerDefaultVal(TensorView* tv, Val* val);
30+
31+
private:
32+
std::unordered_map<TensorView*, Val*> default_val_map_;
33+
};
34+
35+
} // namespace nvfuser

csrc/device_lower/analysis/fusion_info.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
#include <compute_at_map.h>
11+
#include <device_lower/analysis/default_val.h>
1112
#include <device_lower/analysis/fused_reduction.h>
1213
#include <device_lower/analysis/padded_parallel_dimensions.h>
1314
#include <device_lower/analysis/thread_predicate.h>
@@ -114,6 +115,11 @@ class FusionInfo {
114115

115116
FUSION_INFO_DEFINE_FUNCTIONS(IdModel, id_model, idModel);
116117

118+
FUSION_INFO_DEFINE_FUNCTIONS(
119+
TensorDefaultVal,
120+
tensor_default_val,
121+
tensorDefaultVal);
122+
117123
private:
118124
FUSION_INFO_DEFINE_FIELD(
119125
ConcretizedBroadcastDomains,
@@ -132,6 +138,8 @@ class FusionInfo {
132138
FUSION_INFO_DEFINE_FIELD(ComputeAtMap, ca_map);
133139

134140
FUSION_INFO_DEFINE_FIELD(IdModel, id_model);
141+
142+
FUSION_INFO_DEFINE_FIELD(TensorDefaultVal, tensor_default_val);
135143
};
136144

137145
#undef FUSION_INFO_DEFINE_FUNCTIONS

csrc/device_lower/lower2device.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,10 @@ void GpuLower::analysis(Fusion* fusion) {
456456
replaceSymbolicSizes(fusion_);
457457
dumpExprsIfEnabled(fusion_->exprs(), "replaceSymbolicSizes");
458458

459+
// Does not need to be placed here as it has no dependency to any other
460+
// analysis.
461+
info().set(std::make_unique<TensorDefaultVal>(fusion_));
462+
459463
// New IterDomains may be created, so it is expected that generated
460464
code may use different variable names
461465
if (idModelOptions().buildIdModel()) {

csrc/device_lower/pass/allocation.cpp

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -22,37 +22,6 @@
2222
namespace nvfuser {
2323

2424
namespace {
25-
// True if a given domain is a loop domain of a given tensor and its
26-
// loop is partitioned with respect to the memory type of the tensor
27-
bool isPartitionedLoop(const TensorView* tv, IterDomain* id) {
28-
// False if id is not a loop ID
29-
if (std::find(tv->getLoopDomain().begin(), tv->getLoopDomain().end(), id) ==
30-
tv->getLoopDomain().end()) {
31-
return false;
32-
}
33-
34-
// If the memory of this domain is partitioned with respect to the
35-
// parallel type of the domain, there's no allocation for the domain
36-
return ir_utils::isMemoryPartitionedAcross(
37-
tv->getMemoryType(), id->getParallelType());
38-
}
39-
40-
bool isSizeOneDomain(IterDomain* id) {
41-
return id->isBroadcast() || id->extent()->isOneInt();
42-
}
43-
44-
// True if a given domain of a tensor *may* require allocation
45-
bool mayRequireAllocation(const TensorView* tv, IterDomain* id) {
46-
// Conditions to consider:
47-
// - Fully partitioned
48-
// - Size one: Allocation is done based on the promotion ID, but as
49-
// long as the original ID has size one, its allocation should
50-
// remain size one.
51-
// - Reduction: Check the original ID, not the promotion, which may
52-
// be a reduction ID even though the original ID is not a reduction
53-
return !isPartitionedLoop(tv, id) && !isSizeOneDomain(id) &&
54-
!id->isReduction() && !id->isStride();
55-
}
5625

5726
// Get the allocation stride of a given allocation domain
5827
Val* getStrideOfGlobalMemoryTensor(TensorView* tv, int64_t alloc_dim) {
@@ -386,7 +355,7 @@ class AllocationDomainSetup : private kir::IrVisitor {
386355
std::vector<IterDomain*> actual_allocation_ids;
387356
std::vector<std::optional<bool>> actual_contiguity;
388357
for (auto [i, id] : enumerate(allocation_domains)) {
389-
if (mayRequireAllocation(tv, id)) {
358+
if (ir_utils::mayRequireAllocation(tv, id)) {
390359
actual_allocation_ids.push_back(id);
391360
actual_contiguity.push_back(contiguity.at(i));
392361
}
@@ -464,7 +433,7 @@ class AllocationDomainSetup : private kir::IrVisitor {
464433
auto allocation_domain = allocation_domains.at(dim);
465434
auto promotion_domain = promoted_allocation_domains.at(dim);
466435

467-
if (!mayRequireAllocation(tv, allocation_domain)) {
436+
if (!ir_utils::mayRequireAllocation(tv, allocation_domain)) {
468437
continue;
469438
}
470439

@@ -494,7 +463,7 @@ class AllocationDomainSetup : private kir::IrVisitor {
494463
for (const auto i : arange(allocation_domains.size())) {
495464
auto allocation_domain = allocation_domains.at(i);
496465
auto promotion_domain = promoted_allocation_domains.at(i);
497-
if (!mayRequireAllocation(tv, allocation_domain)) {
466+
if (!ir_utils::mayRequireAllocation(tv, allocation_domain)) {
498467
continue;
499468
}
500469
auto stride = strides.at(i);
@@ -760,7 +729,7 @@ class AllocationDomainSetup : private kir::IrVisitor {
760729
for (auto out : expr->outputs()) {
761730
auto it = equiv_domain_set.find(out->as<IterDomain>());
762731
if (it == equiv_domain_set.end() &&
763-
mayRequireAllocation(tv, out->as<IterDomain>())) {
732+
ir_utils::mayRequireAllocation(tv, out->as<IterDomain>())) {
764733
// missing dependency
765734
return std::nullopt;
766735
}
@@ -1277,7 +1246,25 @@ class AllocationInserter : public kir::ExprMutator {
12771246

12781247
auto out_tv = out->as<TensorView>();
12791248
auto default_val =
1280-
gpu_lower_->predicateElimination().getInitValue(out_tv);
1249+
FusionInfoGuard::current()->tensorDefaultVal().get(out_tv);
1250+
1251+
// Check if out_tv must also be initialized for predicate
1252+
// elimination. If so, the two initialization values must match
1253+
if (auto init_for_pred_elimination =
1254+
gpu_lower_->predicateElimination().getInitValue(out_tv)) {
1255+
if (default_val != nullptr) {
1256+
NVF_ERROR(
1257+
default_val->sameAs(init_for_pred_elimination),
1258+
"Conflicting default val for ",
1259+
out_tv->toString(),
1260+
". ",
1261+
default_val->toString(),
1262+
" vs ",
1263+
init_for_pred_elimination->toString());
1264+
} else {
1265+
default_val = init_for_pred_elimination;
1266+
}
1267+
}
12811268

12821269
Val* init = nullptr;
12831270
if (out_tv->dtype() == DataType::Float4_e2m1fn) {

0 commit comments

Comments
 (0)