diff --git a/modules/cudaarithm/src/cuda/threshold.cu b/modules/cudaarithm/src/cuda/threshold.cu
index 5c60161af98..7f2616307d1 100644
--- a/modules/cudaarithm/src/cuda/threshold.cu
+++ b/modules/cudaarithm/src/cuda/threshold.cu
@@ -127,7 +127,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
 }
 
 __global__ void
-otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
+otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums, uint n_samples)
 {
     const uint n_bins = 256;
 
@@ -137,7 +137,6 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned
     int bin_idx = threadIdx.x;
     int threshold = blockIdx.x;
 
-    uint n_samples = threshold_sums[0];
     uint n_samples_above = threshold_sums[threshold];
     uint n_samples_below = n_samples - n_samples_above;
 
@@ -149,15 +148,21 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned
     float threshold_variance_below_f32 = 0;
     if (bin_idx > threshold)
     {
-        float mean = (float) sum_above / n_samples_above;
-        float sigma = bin_idx - mean;
-        threshold_variance_above_f32 = sigma * sigma;
+        if (n_samples_above > 0)
+        {
+            float mean = (float) sum_above / n_samples_above;
+            float sigma = bin_idx - mean;
+            threshold_variance_above_f32 = sigma * sigma;
+        }
     }
     else
     {
-        float mean = (float) sum_below / n_samples_below;
-        float sigma = bin_idx - mean;
-        threshold_variance_below_f32 = sigma * sigma;
+        if (n_samples_below > 0)
+        {
+            float mean = (float) sum_below / n_samples_below;
+            float sigma = bin_idx - mean;
+            threshold_variance_below_f32 = sigma * sigma;
+        }
     }
 
     uint bin_count = histogram[bin_idx];
@@ -198,7 +203,7 @@ __device__ bool has_lowest_score(
 }
 
 __global__ void
-otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
+otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance, uint n_samples)
 {
     const uint n_thresholds = 256;
 
@@ -206,7 +211,6 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
 
     int threshold = threadIdx.x;
 
-    uint n_samples = threshold_sums[0];
     uint n_samples_above = threshold_sums[threshold];
     uint n_samples_below = n_samples - n_samples_above;
 
@@ -241,7 +245,7 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
     }
 }
 
-void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
+void compute_otsu(uint *histogram, uint *otsu_threshold, uint n_samples, Stream &stream)
 {
     const uint n_bins = 256;
     const uint n_thresholds = 256;
@@ -261,12 +265,12 @@ void compute_otsu(uint *histogram, uint *otsu_threshold, Stream &stream)
     otsu_sums<<<grid_all, block_all, 0, cuda_stream>>>(
         histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
     otsu_variance<<<grid_all, block_all, 0, cuda_stream>>>(
-        gpu_variances.ptr<float2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>());
+        gpu_variances.ptr<float2>(), histogram, gpu_threshold_sums.ptr<uint>(), gpu_sums.ptr<unsigned long long>(), n_samples);
     otsu_score<<<grid_score, block_score, 0, cuda_stream>>>(
-        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<float2>());
+        otsu_threshold, gpu_threshold_sums.ptr<uint>(), gpu_variances.ptr<float2>(), n_samples);
 }
 
-// TODO: Replace this is cv::cuda::calcHist
+// TODO: Replace this with cv::cuda::calcHist
 template <uint n_bins>
 __global__ void histogram_kernel(
     uint *histogram, const uint8_t *image, uint width,
@@ -334,7 +338,7 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
         calcHist(src, gpu_histogram, stream);
 
         GpuMat gpu_otsu_threshold(1, 1, CV_32SC1, pool.getAllocator());
-        compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), stream);
+        compute_otsu(gpu_histogram.ptr<uint>(), gpu_otsu_threshold.ptr<uint>(), src.rows * src.cols, stream);
 
         cv::Mat mat_otsu_threshold;
         gpu_otsu_threshold.download(mat_otsu_threshold, stream);
diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index 1a83c57d6d5..39727ce5073 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -395,6 +395,7 @@ class CV_EXPORTS_W NVSurfaceToColorConverter {
      * @param stream Stream for the asynchronous version.
      */
     CV_WRAP virtual bool convert(InputArray yuv, OutputArray color, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false, cuda::Stream& stream = cuda::Stream::Null()) = 0;
+    virtual ~NVSurfaceToColorConverter() {} // virtual: lets implementations be destroyed correctly through a base-class pointer
 };
 
 /** @brief Creates a NVSurfaceToColorConverter.
diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
index 3cb9452c8ff..6393ba92267 100644
--- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
+++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
@@ -118,11 +118,11 @@ class CV_EXPORTS_W DescriptorMatcher : public cv::Algorithm
 
     /** @brief Clears the train descriptor collection.
      */
-    CV_WRAP virtual void clear() = 0;
+    CV_WRAP virtual void clear() CV_OVERRIDE = 0;
 
     /** @brief Returns true if there are no train descriptors in the collection.
      */
-    CV_WRAP virtual bool empty() const = 0;
+    CV_WRAP virtual bool empty() const CV_OVERRIDE = 0;
 
     /** @brief Trains a descriptor matcher.
 
diff --git a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
index 4f0a14aafd2..2f86db41762 100644
--- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
+++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
@@ -1020,9 +1020,6 @@ void NvidiaOpticalFlowImpl_2::calc(InputArray _frame0, InputArray _frame1, Input
     GpuMat flowXYGpuMat(Size((m_width + m_hwGridSize - 1) / m_hwGridSize,
         (m_height + m_hwGridSize - 1) / m_hwGridSize), CV_16SC2,
         (void*)m_flowXYcuDevPtr, m_outputBufferStrideInfo.strideInfo[0].strideXInBytes);
-    GpuMat flowXYGpuMatUpScaled(Size((m_width + m_gridSize - 1) / m_gridSize,
-        (m_height + m_gridSize - 1) / m_gridSize), CV_16SC2,
-        (void*)m_flowXYUpScaledcuDevPtr, m_outputUpScaledBufferStrideInfo.strideInfo[0].strideXInBytes);
 
     //check whether frame0 is Mat or GpuMat
     if (_frame0.isMat())
@@ -1105,6 +1102,9 @@ void NvidiaOpticalFlowImpl_2::calc(InputArray _frame0, InputArray _frame1, Input
 
     if (m_scaleFactor > 1)
     {
+        GpuMat flowXYGpuMatUpScaled(Size((m_width + m_gridSize - 1) / m_gridSize,
+            (m_height + m_gridSize - 1) / m_gridSize), CV_16SC2,
+            (void*)m_flowXYUpScaledcuDevPtr, m_outputUpScaledBufferStrideInfo.strideInfo[0].strideXInBytes);
         uint32_t nSrcWidth = flowXYGpuMat.size().width;
         uint32_t nSrcHeight = flowXYGpuMat.size().height;
         uint32_t nSrcPitch = m_outputBufferStrideInfo.strideInfo[0].strideXInBytes;
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
index f60ab5d0eb7..2bcc2fb3aab 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
@@ -179,7 +179,12 @@ template <class PtrTuple> struct PtrTraits< ZipPtrSz<PtrTuple> > : PtrTraitsBase
 }}
 
 #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4))
+#if (__CUDACC_VER_MAJOR__ > 13 || (__CUDACC_VER_MAJOR__ == 13 && __CUDACC_VER_MINOR__ >= 2))
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+#else
 _LIBCUDACXX_BEGIN_NAMESPACE_STD
+#endif
+
 
 template< class... Types >
 struct tuple_size< cv::cudev::ZipPtr<tuple<Types...> > >
@@ -198,7 +203,11 @@ template<size_t N, class... Types >
 struct tuple_element<N, cv::cudev::ZipPtrSz<tuple<Types...> > >
 : tuple_element<N, tuple<Types...> > { };
 
+#if (__CUDACC_VER_MAJOR__ > 13 || (__CUDACC_VER_MAJOR__ == 13 && __CUDACC_VER_MINOR__ >= 2))
+_CCCL_END_NAMESPACE_CUDA_STD
+#else
 _LIBCUDACXX_END_NAMESPACE_STD
+#endif
 
 #endif
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/disparity_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/disparity_filter.hpp
index 7f866d220ee..2e7fce8621f 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/disparity_filter.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/disparity_filter.hpp
@@ -62,11 +62,14 @@ class CV_EXPORTS_W DisparityFilter : public Algorithm
     @param left_view left view of the original stereo-pair to guide the filtering process, 8-bit single-channel
     or three-channel image.
 
-    @param filtered_disparity_map output disparity map.
+    @param filtered_disparity_map output disparity map, single-channel CV_16S type,
+    with disparity values scaled by 16.
+
 
     @param disparity_map_right optional argument, some implementations might also use the disparity map
-    of the right view to compute confidence maps. If provided, it must be a single-channel CV_32F matrix,
-    otherwise a runtime assertion will fail.
+    of the right view to compute confidence maps. If provided, it must be a single-channel CV_16S matrix.
+    Disparity values are expected to be scaled by 16 (a one-pixel disparity corresponds to the value 16).
+
 
     @param ROI region of the disparity map to filter. Optional, usually it should be set automatically.