schweitzpgi
diff --git a/‎docs/sphinx/using/backends/sims/tnsims.rst‎
Lines changed: 3 additions & 0 deletions b/‎docs/sphinx/using/backends/sims/tnsims.rst‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎runtime/common/NoiseModel.cpp‎
Lines changed: 1 addition & 1 deletion b/‎runtime/common/NoiseModel.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎runtime/common/NoiseModel.h‎
Lines changed: 1 addition & 1 deletion b/‎runtime/common/NoiseModel.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎runtime/nvqir/cutensornet/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions b/‎runtime/nvqir/cutensornet/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎runtime/nvqir/cutensornet/simulator_cutensornet.cpp‎
Lines changed: 213 additions & 7 deletions b/‎runtime/nvqir/cutensornet/simulator_cutensornet.cpp‎
Lines changed: 213 additions & 7 deletions
diff --git a/‎runtime/nvqir/cutensornet/simulator_cutensornet.h‎
Lines changed: 15 additions & 0 deletions b/‎runtime/nvqir/cutensornet/simulator_cutensornet.h‎
Lines changed: 15 additions & 0 deletions
@@ -81,6 +81,9 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`CUDA_VISIBLE_DEVICES=X`**: Makes the process only see GPU X on multi-GPU nodes. Each MPI process must only see its own dedicated GPU. For example, if you run 8 MPI processes on a DGX system with 8 GPUs, each MPI process should be assigned its own dedicated GPU via `CUDA_VISIBLE_DEVICES` when invoking `mpiexec` (or `mpirun`) commands. 
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
+* **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1.
+* **`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=X`**: Set this environment variable to `TRUE` (`ON`) or `FALSE` (`OFF`) to enable or disable contraction path reuse when computing expectation values. Default is `OFF`.
+* **`CUDAQ_TENSORNET_NUM_HYPER_SAMPLES=X`**: Specify the number of hyper samples used in the tensor network contraction path finder. Default value is 8 if not specified. 
 
 .. note:: 
 
 
@@ -97,7 +97,7 @@ kraus_channel &kraus_channel::operator=(const kraus_channel &other) {
   return *this;
 }
 
-std::vector<kraus_op> kraus_channel::get_ops() { return ops; }
+std::vector<kraus_op> kraus_channel::get_ops() const { return ops; }
 void kraus_channel::push_back(kraus_op op) { ops.push_back(op); }
 
 void noise_model::add_channel(const std::string &quantumOp,
 
@@ -185,7 +185,7 @@ class kraus_channel {
   kraus_channel &operator=(const kraus_channel &other);
 
   /// @brief Return all kraus_ops in this channel
-  std::vector<kraus_op> get_ops();
+  std::vector<kraus_op> get_ops() const;
 
   /// @brief Add a kraus_op to this channel.
   void push_back(kraus_op op);
 
@@ -60,8 +60,8 @@ set(CUTENSORNET_PATCH ${CMAKE_MATCH_1})
 
 set(CUTENSORNET_VERSION ${CUTENSORNET_MAJOR}.${CUTENSORNET_MINOR}.${CUTENSORNET_PATCH})
 message(STATUS "Found cutensornet version: ${CUTENSORNET_VERSION}")
-# We need cutensornet v2.5.0+
-if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.5")
+# We need cutensornet v2.6.0+ (cutensornetStateApplyUnitaryChannel)
+if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.6")
   set (BASE_TENSOR_BACKEND_SRS  
         simulator_cutensornet.cpp 
         tensornet_spin_op.cpp
 
@@ -11,6 +11,17 @@
 #include "cutensornet.h"
 #include "tensornet_spin_op.h"
 
+namespace {
+const std::vector<std::complex<double>> matPauliI = {
+    {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}};
+const std::vector<std::complex<double>> matPauliX{
+    {0.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
+const std::vector<std::complex<double>> matPauliY{
+    {0.0, 0.0}, {0.0, -1.0}, {0.0, 1.0}, {0.0, 0.0}};
+const std::vector<std::complex<double>> matPauliZ{
+    {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}};
+} // namespace
+
 namespace nvqir {
 
 SimulatorTensorNetBase::SimulatorTensorNetBase()
@@ -137,6 +148,201 @@ void SimulatorTensorNetBase::applyGate(const GateApplicationTask &task) {
   }
 }
 
+// Helper to check whether a matrix is a scaled unitary matrix, i.e., `k * U`
+// where U is a unitary matrix. If so, it also returns the `k` factor.
+// Otherwise, return a nullopt.
+template <typename T>
+std::optional<double> isScaledUnitary(const std::vector<std::complex<T>> &mat,
+                                      double eps) {
+  typedef Eigen::Matrix<std::complex<T>, Eigen::Dynamic, Eigen::Dynamic,
+                        Eigen::RowMajor>
+      RowMajorMatTy;
+  const int dim = std::log2(mat.size());
+  Eigen::Map<const RowMajorMatTy> kMat(mat.data(), dim, dim);
+  if (kMat.isZero())
+    return std::nullopt;
+  // Check that (K_dag * K) is a scaled identity matrix
+  // i.e., the K matrix is a scaled unitary.
+  auto kdK = kMat.adjoint() * kMat;
+  if (!kdK.isDiagonal())
+    return std::nullopt;
+  // First element
+  std::complex<T> val = kdK(0, 0);
+  if (std::abs(val) > eps && std::abs(val.imag()) < eps) {
+    auto scaledKdK = (std::complex<T>{1.0} / val) * kdK;
+    if (scaledKdK.isIdentity())
+      return val.real();
+  }
+  return std::nullopt;
+}
+
+std::optional<std::pair<
+    std::vector<double>,
+    std::vector<std::vector<std::complex<
+        double>>>>> static computeUnitaryMixture(const std::
+                                                     vector<std::vector<
+                                                         std::complex<double>>>
+                                                         &krausOps,
+                                                 double tol = 1e-6) {
+  std::vector<double> probs;
+  std::vector<std::vector<std::complex<double>>> mats;
+  const auto scaleMat = [](const std::vector<std::complex<double>> &mat,
+                           double scaleFactor) {
+    std::vector<std::complex<double>> scaledMat = mat;
+    for (auto &x : scaledMat)
+      x /= scaleFactor;
+    return scaledMat;
+  };
+  for (const auto &op : krausOps) {
+    const auto scaledFactor = isScaledUnitary(op, tol);
+    if (!scaledFactor.has_value())
+      return std::nullopt;
+    probs.emplace_back(scaledFactor.value());
+    mats.emplace_back(scaleMat(op, scaledFactor.value()));
+  }
+
+  if (std::abs(1.0 - std::reduce(probs.begin(), probs.end())) > tol)
+    return std::nullopt;
+
+  return std::make_pair(probs, mats);
+}
+
+// Helper to look up a device memory pointer from a  cache.
+// If not found, allocate a new device memory buffer and put it to the cache.
+static void *
+getOrCacheMat(const std::string &key,
+              const std::vector<std::complex<double>> &mat,
+              std::unordered_map<std::string, void *> &gateDeviceMemCache) {
+  const auto iter = gateDeviceMemCache.find(key);
+
+  if (iter == gateDeviceMemCache.end()) {
+    void *dMem = allocateGateMatrix(mat);
+    gateDeviceMemCache[key] = dMem;
+    return dMem;
+  }
+  return iter->second;
+};
+
+void SimulatorTensorNetBase::applyKrausChannel(
+    const std::vector<int32_t> &qubits,
+    const cudaq::kraus_channel &krausChannel) {
+  LOG_API_TIME();
+  switch (krausChannel.noise_type) {
+  case cudaq::noise_model_type::depolarization_channel: {
+    if (krausChannel.parameters.size() != 1)
+      throw std::runtime_error(
+          fmt::format("Invalid parameters for a depolarization channel. "
+                      "Expecting 1 parameter, got {}.",
+                      krausChannel.parameters.size()));
+    const std::vector<void *> channelMats{
+        getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache),
+        getOrCacheMat("PauliX", matPauliX, m_gateDeviceMemCache),
+        getOrCacheMat("PauliY", matPauliY, m_gateDeviceMemCache),
+        getOrCacheMat("PauliZ", matPauliZ, m_gateDeviceMemCache)};
+    const double p = krausChannel.parameters[0];
+    const std::vector<double> probabilities = {1 - p, p / 3., p / 3., p / 3.};
+    m_state->applyUnitaryChannel(qubits, channelMats, probabilities);
+    break;
+  }
+  case cudaq::noise_model_type::bit_flip_channel: {
+    if (krausChannel.parameters.size() != 1)
+      throw std::runtime_error(
+          fmt::format("Invalid parameters for a bit-flip channel. "
+                      "Expecting 1 parameter, got {}.",
+                      krausChannel.parameters.size()));
+
+    const std::vector<void *> channelMats{
+        getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache),
+        getOrCacheMat("PauliX", matPauliX, m_gateDeviceMemCache)};
+    const double p = krausChannel.parameters[0];
+    const std::vector<double> probabilities = {1 - p, p};
+    m_state->applyUnitaryChannel(qubits, channelMats, probabilities);
+    break;
+  }
+  case cudaq::noise_model_type::phase_flip_channel: {
+    if (krausChannel.parameters.size() != 1)
+      throw std::runtime_error(
+          fmt::format("Invalid parameters for a phase-flip channel. "
+                      "Expecting 1 parameter, got {}.",
+                      krausChannel.parameters.size()));
+
+    const std::vector<void *> channelMats{
+        getOrCacheMat("PauliI", matPauliI, m_gateDeviceMemCache),
+        getOrCacheMat("PauliZ", matPauliZ, m_gateDeviceMemCache)};
+    const double p = krausChannel.parameters[0];
+    const std::vector<double> probabilities = {1 - p, p};
+    m_state->applyUnitaryChannel(qubits, channelMats, probabilities);
+    break;
+  }
+  case cudaq::noise_model_type::amplitude_damping_channel: {
+    if (krausChannel.parameters.size() != 1)
+      throw std::runtime_error(
+          fmt::format("Invalid parameters for a amplitude damping channel. "
+                      "Expecting 1 parameter, got {}.",
+                      krausChannel.parameters.size()));
+    if (krausChannel.parameters[0] != 0.0)
+      throw std::runtime_error("Non-unitary noise channels are not supported.");
+    break;
+  }
+  case cudaq::noise_model_type::unknown: {
+    std::vector<std::vector<std::complex<double>>> mats;
+    for (const auto &op : krausChannel.get_ops())
+      mats.emplace_back(op.data);
+    auto asUnitaryMixture = computeUnitaryMixture(mats);
+    if (asUnitaryMixture.has_value()) {
+      auto &[probabilities, unitaries] = asUnitaryMixture.value();
+      std::vector<void *> channelMats;
+      for (const auto &mat : unitaries)
+        channelMats.emplace_back(getOrCacheMat(
+            "ScaledUnitary_" + std::to_string(vecComplexHash(mat)), mat,
+            m_gateDeviceMemCache));
+      m_state->applyUnitaryChannel(qubits, channelMats, probabilities);
+    } else {
+      throw std::runtime_error("Non-unitary noise channels are not supported.");
+    }
+    break;
+  }
+  default:
+    throw std::runtime_error(
+        "Unsupported noise model type: " +
+        std::to_string(static_cast<int>(krausChannel.noise_type)));
+  }
+}
+
+void SimulatorTensorNetBase::applyNoiseChannel(
+    const std::string_view gateName, const std::vector<std::size_t> &controls,
+    const std::vector<std::size_t> &targets,
+    const std::vector<double> &params) {
+  LOG_API_TIME();
+  // Do nothing if no execution context
+  if (!executionContext)
+    return;
+
+  // Do nothing if no noise model
+  if (!executionContext->noiseModel)
+    return;
+
+  // Get the name as a string
+  std::string gName(gateName);
+  std::vector<int32_t> qubits{controls.begin(), controls.end()};
+  qubits.insert(qubits.end(), targets.begin(), targets.end());
+
+  // Get the Kraus channels specified for this gate and qubits
+  auto krausChannels = executionContext->noiseModel->get_channels(
+      gName, targets, controls, params);
+
+  // If none, do nothing
+  if (krausChannels.empty())
+    return;
+
+  cudaq::info(
+      "[SimulatorTensorNetBase] Applying {} kraus channels on qubits: {}",
+      krausChannels.size(), qubits);
+
+  for (const auto &krausChannel : krausChannels)
+    applyKrausChannel(qubits, krausChannel);
+}
+
 /// @brief Reset the state of a given qubit to zero
 void SimulatorTensorNetBase::resetQubit(const std::size_t qubitIdx) {
   flushGateQueue();
@@ -225,10 +431,7 @@ cudaq::ExecutionResult
 SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
                                const int shots) {
   LOG_API_TIME();
-  std::vector<int32_t> measuredBitIds;
-  std::transform(measuredBits.begin(), measuredBits.end(),
-                 std::back_inserter(measuredBitIds),
-                 [](std::size_t idx) { return static_cast<int32_t>(idx); });
+  std::vector<int32_t> measuredBitIds(measuredBits.begin(), measuredBits.end());
   if (shots < 1) {
     cudaq::spin_op::spin_op_term allZTerm(2 * m_state->getNumQubits(), 0);
     for (const auto &m : measuredBits)
@@ -239,7 +442,8 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
   }
 
   prepareQubitTensorState();
-  const auto samples = m_state->sample(measuredBitIds, shots);
+  const auto samples =
+      m_state->sample(measuredBitIds, shots, requireCacheWorkspace());
   cudaq::ExecutionResult counts(samples);
   double expVal = 0.0;
   // Compute the expectation value from the counts
@@ -286,7 +490,8 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
     // cutensornetNetworkOperator_t and compute the expectation value.
     TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
     std::complex<double> expVal =
-        m_state->computeExpVal(spinOp.getNetworkOperator());
+        m_state->computeExpVal(spinOp.getNetworkOperator(),
+                               this->executionContext->numberTrajectories);
     expVal += spinOp.getIdentityTermOffset();
     return cudaq::observe_result(expVal.real(), ham,
                                  cudaq::sample_result(cudaq::ExecutionResult(
@@ -316,7 +521,8 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
   });
 
   // Compute the expectation value for all terms
-  const auto termExpVals = m_state->computeExpVals(terms);
+  const auto termExpVals = m_state->computeExpVals(
+      terms, this->executionContext->numberTrajectories);
   std::complex<double> expVal = 0.0;
   // Construct per-term data in the final observe_result
   std::vector<cudaq::ExecutionResult> results;
 
@@ -30,6 +30,12 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
   /// @brief Apply quantum gate
   void applyGate(const GateApplicationTask &task) override;
 
+  /// @brief Apply a noise channel
+  void applyNoiseChannel(const std::string_view gateName,
+                         const std::vector<std::size_t> &controls,
+                         const std::vector<std::size_t> &targets,
+                         const std::vector<double> &params) override;
+
   // Override base calculateStateDim (we don't instantiate full state vector in
   // the tensornet backend). When the user want to retrieve the state vector, we
   // check if it is feasible to do so.
@@ -88,6 +94,15 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
   /// @brief Query if direct expectation value calculation is enabled
   virtual bool canHandleObserve() override;
 
+  /// @brief Return true if this simulator can use cache workspace (e.g., for
+  /// intermediate tensors)
+  virtual bool requireCacheWorkspace() const = 0;
+
+private:
+  // Helper to apply a Kraus channel
+  void applyKrausChannel(const std::vector<int32_t> &qubits,
+                         const cudaq::kraus_channel &channel);
+
 protected:
   cutensornetHandle_t m_cutnHandle;
   std::unique_ptr<TensorNetState> m_state;
Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ kraus_channel &kraus_channel::operator=(const kraus_channel &other) {`
`97`	`97`	`return *this;`
`98`	`98`	`}`
`99`	`99`
`100`		`-std::vector<kraus_op> kraus_channel::get_ops() { return ops; }`
	`100`	`+std::vector<kraus_op> kraus_channel::get_ops() const { return ops; }`
`101`	`101`	`void kraus_channel::push_back(kraus_op op) { ops.push_back(op); }`
`102`	`102`
`103`	`103`	`void noise_model::add_channel(const std::string &quantumOp,`