NVIDIA
diff --git a/‎cudaq/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h‎
Lines changed: 2 additions & 0 deletions b/‎cudaq/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cudaq/lib/Optimizer/Builder/Intrinsics.cpp‎
Lines changed: 2 additions & 0 deletions b/‎cudaq/lib/Optimizer/Builder/Intrinsics.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cudaq/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp‎
Lines changed: 58 additions & 17 deletions b/‎cudaq/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp‎
Lines changed: 58 additions & 17 deletions
diff --git a/‎cudaq/test/Transforms/qir_api_measure_handle.qke‎
Lines changed: 7 additions & 19 deletions b/‎cudaq/test/Transforms/qir_api_measure_handle.qke‎
Lines changed: 7 additions & 19 deletions
diff --git a/‎python/cudaq/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎python/cudaq/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/cudaq/runtime/dem.py‎
Lines changed: 41 additions & 0 deletions b/‎python/cudaq/runtime/dem.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎python/extension/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎python/extension/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/extension/CUDAQuantumExtension.cpp‎
Lines changed: 2 additions & 0 deletions b/‎python/extension/CUDAQuantumExtension.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/runtime/cudaq/analysis/py_dem.cpp‎
Lines changed: 41 additions & 0 deletions b/‎python/runtime/cudaq/analysis/py_dem.cpp‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎python/runtime/cudaq/analysis/py_dem.h‎
Lines changed: 15 additions & 0 deletions b/‎python/runtime/cudaq/analysis/py_dem.h‎
Lines changed: 15 additions & 0 deletions
@@ -19,6 +19,8 @@ static constexpr const char QIRMeasureBody[] = "__quantum__qis__mz__body";
 static constexpr const char QIRMeasure[] = "__quantum__qis__mz";
 static constexpr const char QIRMeasureToRegister[] =
     "__quantum__qis__mz__to__register";
+static constexpr const char QIRMeasureHandleToRegister[] =
+    "__quantum__qis__mz_handle__to__register";
 static constexpr const char QIRResetBody[] = "__quantum__qis__reset__body";
 static constexpr const char QIRReset[] = "__quantum__qis__reset";
 
 
@@ -631,6 +631,8 @@ static constexpr IntrinsicCode intrinsicTable[] = {
   func.func private @__quantum__qis__reset(!qir_qubit)
   func.func private @__quantum__qis__mz(!qir_qubit) -> !qir_result
   func.func private @__quantum__qis__mz__to__register(!qir_qubit, !qir_charptr) -> !qir_result
+  func.func private @__quantum__qis__mz_handle__to__register(!qir_qubit, !qir_charptr) -> i64
+  func.func private @__quantum__qis__read_result__body(!qir_result) -> i1
   func.func private @__quantum__qis__swap(!qir_qubit, !qir_qubit)
   func.func private @__quantum__qis__rx(f64, !qir_qubit)
   func.func private @__quantum__qis__phased_rx(f64, f64, !qir_qubit)
 
@@ -820,6 +820,25 @@ struct DiscriminateOpRewrite
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = disc.getLoc();
     Value m = adaptor.getMeasurement();
+    // Handle-form: the operand is the `i64` chronological measurement
+    // index produced by `mz_handle__to__register`. Round-trip it through
+    // `Result*` and call `read_result__body`, which looks the bit up in
+    // `measRes2Val` under the inttoptr-encoded `Result*` key populated
+    // by `mz_handle__to__register`.
+    if (isa<IntegerType>(m.getType())) {
+      auto ctx = rewriter.getContext();
+      auto resultPtrTy = cudaq::cc::PointerType::get(
+          LLVM::LLVMStructType::getOpaque("Result", ctx));
+      auto resAsPtr = cudaq::cc::CastOp::create(rewriter, loc, resultPtrTy, m);
+      rewriter.replaceOpWithNewOp<func::CallOp>(
+          disc, rewriter.getI1Type(), cudaq::opt::qir0_1::ReadResultBody,
+          ValueRange{resAsPtr});
+      return success();
+    }
+    // Non-handle path: legacy `Result* -> ptr<i1>; load` pattern. Safe
+    // because the only producer of `Result*` outside the handle path is
+    // the sentinel-returning `mz` / `mz__to__register`, where
+    // `Result = bool` and the pointer is dereferenceable.
     auto i1PtrTy = cudaq::cc::PointerType::get(rewriter.getI1Type());
     auto cast = cudaq::cc::CastOp::create(rewriter, loc, i1PtrTy, m);
     rewriter.replaceOpWithNewOp<cudaq::cc::LoadOp>(disc, cast);
@@ -844,12 +863,22 @@ struct DiscriminateOpToCallRewrite
     // expected by the QIR read-result functions.
     SmallVector<Value> operands{adaptor.getOperands().begin(),
                                 adaptor.getOperands().end()};
-    if (operands.size() == 1 && isa<IntegerType>(operands.front().getType())) {
+    const bool operandIsHandle =
+        operands.size() == 1 && isa<IntegerType>(operands.front().getType());
+    if (operandIsHandle) {
       auto resultTy = M::getResultType(rewriter.getContext());
       operands.front() =
           cudaq::cc::CastOp::create(rewriter, loc, resultTy, operands.front());
     }
-    if constexpr (M::discriminateToClassical) {
+    // For handle-form callers, the i64 payload is the chronological
+    // measurement index produced by `mz_handle__to__register`. Loading
+    // through `Result*` as if it were `bool*` (the legacy bitcast+load
+    // pattern below) would dereference an integer-encoded pointer and
+    // segfault — the read-result runtime call is the QIR-spec way to
+    // recover the bit. The `mz_handle__to__register` adapter populates
+    // `measRes2Val` keyed by the index-encoded `Result*`, so the lookup
+    // resolves.
+    if (operandIsHandle || M::discriminateToClassical) {
       if constexpr (M::qirVersion == QirVersion::version_1_0) {
         rewriter.replaceOpWithNewOp<func::CallOp>(
             disc, rewriter.getI1Type(), cudaq::opt::qir1_0::ReadResult,
@@ -1490,20 +1519,38 @@ struct MeasurementOpPattern : public OpConversionPattern<cudaq::quake::MzOp> {
                             adaptor.getTargets().end()};
     auto functionName = M::getQIRMeasure();
 
-    // Handle-form measurements produce a `!cc.measure_handle` SSA value
-    // whose converted type is `i64`. The QIR measurement function still
-    // returns `Result*`, so we bridge the call's `Result*` result to the
-    // converted `i64` payload via `cc.cast`.
+    // Handle-form measurements produce a `!cc.measure_handle` SSA value whose
+    // converted type is `i64`. Route handle-form callers to the sibling
+    // runtime entry `__quantum__qis__mz_handle__to__register`, which returns
+    // the chronological measurement index directly as `i64`. This follows the
+    // QIR Base/Adaptive Profile convention that the integer encoded in
+    // `Result*` identifies the measurement; see
+    // https://github.com/qir-alliance/qir-spec/blob/1.0/specification/profiles/Base_Profile.md
+    // for details.
     const bool measOutIsHandle =
         isa<cudaq::cc::MeasureHandleType>(mz.getMeasOut().getType());
 
     // Are we using the measurement that returns a result?
     if constexpr (M::mzReturnsResultType) {
-      // Yes, the measurement results the result, so we can use a
-      // straightforward codegen pattern. Use either the mz or the
-      // mz_to_register call (with the name as an extra argument) and forward
-      // the result of the call as the result.
+      // Handle-form gets its own runtime entry that returns `i64` directly.
+      if (measOutIsHandle) {
+        auto cstringGlobal =
+            createGlobalCString(mz, loc, rewriter, regNameAttr.getValue());
+        args.push_back(cstringGlobal);
+        auto i64Ty = rewriter.getI64Type();
+        auto call = func::CallOp::create(
+            rewriter, loc, i64Ty, cudaq::opt::QIRMeasureHandleToRegister, args);
+        call->setAttr(cudaq::opt::QIRRegisterNameAttr, regNameAttr);
+        SmallVector<Value> replaceVals;
+        replaceVals.push_back(call.getResult(0));
+        auto assundry = filterArgs(mz, adaptor.getTargets());
+        replaceVals.append(assundry.begin(), assundry.end());
+        rewriter.replaceOp(mz, replaceVals);
+        return success();
+      }
 
+      // Non-handle path: use the standard mz / mz__to__register call and
+      // forward its `Result*` result unchanged.
       if (mz->getAttr(cudaq::opt::MzAssignedNameAttrName)) {
         functionName = cudaq::opt::QIRMeasureToRegister;
         auto cstringGlobal =
@@ -1515,13 +1562,7 @@ struct MeasurementOpPattern : public OpConversionPattern<cudaq::quake::MzOp> {
           func::CallOp::create(rewriter, loc, resultTy, functionName, args);
       auto assundry = filterArgs(mz, adaptor.getTargets());
       SmallVector<Value> replaceVals;
-      if (measOutIsHandle) {
-        auto i64Ty = rewriter.getI64Type();
-        replaceVals.push_back(
-            cudaq::cc::CastOp::create(rewriter, loc, i64Ty, call.getResult(0)));
-      } else {
-        replaceVals.append(call.getResults().begin(), call.getResults().end());
-      }
+      replaceVals.append(call.getResults().begin(), call.getResults().end());
       replaceVals.append(assundry.begin(), assundry.end());
       rewriter.replaceOp(mz, replaceVals);
       call->setAttr(cudaq::opt::QIRRegisterNameAttr, regNameAttr);
 
@@ -27,16 +27,8 @@ func.func @scalar_handle() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 
 // CHECK-LABEL:   func.func @scalar_handle() -> i1
 // CHECK:           %[[VAL_Q:.*]] = call @__quantum__rt__qubit_allocate()
-// CHECK:           %[[VAL_R:.*]] = call @__quantum__qis__mz__to__register(%[[VAL_Q]], {{%.*}}) {{.*}} -> !cc.ptr<!llvm.struct<"Result", opaque>>
-// `MeasurementOpPattern` casts the QIR call's `Result*` to the converted
-// `i64` payload, and `DiscriminateOpToCallRewrite` casts back to
-// `!cc.ptr<i1>`. The two pointer endpoints (`Result*` and `ptr<i1>`) are
-// joined by an i64 round-trip whose only purpose was the handle ABI; the
-// `FuseCastCascade` `ptr -> int -> ptr` rule collapses the chain into a
-// single pointer cast that lowers to `llvm.bitcast`. This avoids the
-// `llvm.ptrtoint` that the NVQIR profile verifier rejects.
-// CHECK:           %[[VAL_P:.*]] = cc.cast %[[VAL_R]] : (!cc.ptr<!llvm.struct<"Result", opaque>>) -> !cc.ptr<i1>
-// CHECK:           %[[VAL_B:.*]] = cc.load %[[VAL_P]] : !cc.ptr<i1>
+// Bind the i64 handle returned by the handle-form measurement call so the
+// cast below refers to a variable defined by this function's own checks.
+// CHECK:           %[[VAL_H:.*]] = call @__quantum__qis__mz_handle__to__register({{.*}}) {{.*}} -> i64
+// CHECK:           %[[VAL_R:.*]] = cc.cast %[[VAL_H]] : (i64) -> !cc.ptr<!llvm.struct<"Result", opaque>>
+// CHECK:           %[[VAL_B:.*]] = call @__quantum__qis__read_result__body(%[[VAL_R]]) : (!cc.ptr<!llvm.struct<"Result", opaque>>) -> i1
 // CHECK:           return %[[VAL_B]] : i1
 
 // -----
@@ -180,9 +172,7 @@ func.func @handle_stdvec_consume(%v: !cc.stdvec<!cc.measure_handle>) -> i1 attri
 
 // -----
 // End-to-end guard: a follow-up canonicalize pass after convert-to-qir-api
-// must preserve the same Result* -> ptr<i1> collapse. The narrow integer-hop
-// non-fold case is already covered in cast_fold.qke.
-
+// must preserve the handle-form lowering.
 func.func @scalar_handle_e2e() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
   %0 = quake.alloca !quake.ref
   %m = quake.mz %0 name "h" : (!quake.ref) -> !cc.measure_handle
@@ -192,12 +182,10 @@ func.func @scalar_handle_e2e() -> i1 attributes {"cudaq-entrypoint", "cudaq-kern
 
 // CHECK-E2E-LABEL:   func.func @scalar_handle_e2e() -> i1
 // CHECK-E2E:           %[[VAL_Q:.*]] = call @__quantum__rt__qubit_allocate()
-// CHECK-E2E:           %[[VAL_R:.*]] = call @__quantum__qis__mz__to__register({{.*}})
-// CHECK-E2E-SAME:        -> !cc.ptr<!llvm.struct<"Result", opaque>>
-// CHECK-E2E:           %[[VAL_P:.*]] = cc.cast %[[VAL_R]] : (!cc.ptr<!llvm.struct<"Result", opaque>>) -> !cc.ptr<i1>
-// CHECK-E2E-NOT:       cc.cast {{.*}} : (!cc.ptr<!llvm.struct<"Result", opaque>>) -> i64
-// CHECK-E2E-NOT:       cc.cast {{.*}} : (i64) -> !cc.ptr<i1>
-// CHECK-E2E:           %[[VAL_B:.*]] = cc.load %[[VAL_P]] : !cc.ptr<i1>
+// CHECK-E2E:           %[[VAL_H:.*]] = call @__quantum__qis__mz_handle__to__register({{.*}}) {{.*}} -> i64
+// CHECK-E2E:           %[[VAL_R:.*]] = cc.cast %[[VAL_H]] : (i64) -> !cc.ptr<!llvm.struct<"Result", opaque>>
+// CHECK-E2E:           %[[VAL_B:.*]] = call @__quantum__qis__read_result__body(%[[VAL_R]]) : (!cc.ptr<!llvm.struct<"Result", opaque>>) -> i1
+// CHECK-E2E-NOT:       cc.load
 // CHECK-E2E:           return %[[VAL_B]] : i1
 
 }
@@ -187,6 +187,7 @@ def _isinstance(other, _cls=cls, _isinst=py_isinstance):
 from .runtime.draw import draw
 from .runtime.unitary import get_unitary
 from .runtime.resource_count import estimate_resources
+from .runtime.dem import dem_from_kernel
 from .runtime.vqe import vqe  # Removed! Use VQE from CUDA-QX
 from .kernel.register_op import register_operation
 from .mlir._mlir_libs._quakeDialects import cudaq_runtime
 
@@ -0,0 +1,41 @@
+# ============================================================================ #
+# Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime
+from cudaq.kernel.kernel_decorator import (mk_decorator, isa_kernel_decorator)
+from cudaq.util import trace
+
+
+@trace.traced
+def dem_from_kernel(kernel, *args, noise_model=None):
+    """Build the detector error model (DEM) text for a CUDA-Q kernel.
+
+    The `kernel` is executed under the internal `"dem"` execution context;
+    the circuit recorded by the backend is captured and rendered as Stim's
+    standard `.dem` text via `stim::DetectorErrorModel::str()`. The active
+    CUDA-Q target is left untouched: the analysis simulator is an internal,
+    thread-local override.
+
+    Args:
+      kernel (:class:`Kernel`): The :class:`Kernel` to analyze. Anything
+          that is not a kernel decorator is passed to `mk_decorator` first.
+      *args: Concrete argument values forwarded to the kernel invocation.
+      noise_model (:class:`NoiseModel`, optional): Noise model layered on
+          top of any `apply_noise` ops already present in the kernel.
+          Defaults to `None`.
+
+    Returns:
+      UTF-8 string in Stim's standard `.dem` file format. Consumers that
+      need a structured DEM can parse it with
+      `stim.DetectorErrorModel(text)`.
+    """
+    # Reuse an existing kernel decorator; otherwise build one for `kernel`.
+    deco = kernel if isa_kernel_decorator(kernel) else mk_decorator(kernel)
+    launch_args, kernel_module = deco.prepare_call(*args)
+    return cudaq_runtime.dem_from_kernel_impl(deco.uniqName, kernel_module,
+                                              noise_model, *launch_args)
@@ -107,6 +107,7 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension
     ../runtime/cudaq/algorithms/py_sample_async.cpp
     ../runtime/cudaq/algorithms/py_sample_ptsbe.cpp
     ../runtime/cudaq/algorithms/py_resource_count.cpp
+    ../runtime/cudaq/analysis/py_dem.cpp
     ../runtime/cudaq/algorithms/py_run.cpp
     ../../runtime/cudaq/algorithms/run.cpp      # Common Python and C++ implementation of run
     ../runtime/cudaq/algorithms/py_state.cpp
@@ -170,6 +171,7 @@ target_include_directories(CUDAQuantumPythonSources.Extension INTERFACE
 )
 target_link_libraries(CUDAQuantumPythonSources.Extension INTERFACE 
     cudaq 
+    cudaq-analysis
     cudaq-logger 
     cudaq-common 
     cudaq-em-default
 
@@ -28,6 +28,7 @@
 #include "runtime/cudaq/algorithms/py_translate.h"
 #include "runtime/cudaq/algorithms/py_unitary.h"
 #include "runtime/cudaq/algorithms/py_utils.h"
+#include "runtime/cudaq/analysis/py_dem.h"
 #include "runtime/cudaq/operators/py_boson_op.h"
 #include "runtime/cudaq/operators/py_fermion_op.h"
 #include "runtime/cudaq/operators/py_handlers.h"
@@ -134,6 +135,7 @@ NB_MODULE(_quakeDialects, m) {
   bindPyRunAsync(cudaqRuntime);
   bindPyTranslate(cudaqRuntime);
   bindCountResources(cudaqRuntime);
+  bindDemFromKernel(cudaqRuntime);
   bindSampleAsync(cudaqRuntime);
   bindSamplePTSBE(cudaqRuntime);
   bindObserveAsync(cudaqRuntime);
 
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "py_dem.h"
+#include "common/NoiseModel.h"
+#include "runtime/cudaq/platform/py_alt_launch_kernel.h"
+#include "cudaq/algorithms/dem.h"
+#include "cudaq/platform.h"
+#include "mlir/Bindings/Python/NanobindAdaptors.h"
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/string.h>
+#include <optional>
+#include <string>
+
+using namespace cudaq;
+
+/// Backend for the Python-side `dem_from_kernel` wrapper.
+///
+/// Passes `args` through `simplifiedValidateInputArguments`, then hands
+/// `cudaq::details::runDemFromKernel` a callback that marshals and launches
+/// `kernelMod` under the name `kernelName`. The string produced by
+/// `runDemFromKernel` is returned unchanged. An empty `noise` is forwarded
+/// as a null pointer.
+static std::string dem_from_kernel_impl(const std::string &kernelName,
+                                        MlirModule kernelMod,
+                                        std::optional<noise_model> noise,
+                                        nanobind::args args) {
+  auto &platform = cudaq::get_platform();
+  args = simplifiedValidateInputArguments(args);
+
+  // Only point at the optional's payload when one was actually supplied.
+  const cudaq::noise_model *noiseOrNull = nullptr;
+  if (noise.has_value())
+    noiseOrNull = &*noise;
+  return cudaq::details::runDemFromKernel(
+      kernelName, platform, noiseOrNull, [&]() {
+        // The launch result is bound but never read.
+        [[maybe_unused]] auto launched =
+            cudaq::marshal_and_launch_module(kernelName, kernelMod, args);
+      });
+}
+
+/// Exposes `dem_from_kernel_impl` on `mod` under the same name. Four argument
+/// annotations are supplied; only the third (the noise model slot) is marked
+/// as accepting `None`.
+void cudaq::bindDemFromKernel(nanobind::module_ &mod) {
+  mod.def("dem_from_kernel_impl", dem_from_kernel_impl, nanobind::arg(),
+          nanobind::arg(), nanobind::arg().none(), nanobind::arg(),
+          "See python documentation for dem_from_kernel.");
+}
@@ -0,0 +1,15 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace cudaq {
+/// Registers the `dem_from_kernel_impl` function on the given nanobind module.
+void bindDemFromKernel(nanobind::module_ &mod);
+} // namespace cudaq