Merge: add cuda10.1 generic spmv in benchmark

yhmtsai · web-flow · commit c1ac9f456b3e · 2020-03-03T21:31:37.000+01:00
This adds the CUDA 10.1 generic SpMV (COO, and CSR with algorithms 1/2) to the benchmark. Related PR: #468
diff --git a/benchmark/utils/cuda_linops.hpp b/benchmark/utils/cuda_linops.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
+#include <cuda.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
 #include <memory>
@@ -45,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/device_guard.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
+#include "cuda/base/types.hpp"
 
 
 namespace detail {
@@ -54,7 +56,12 @@ class CuspBase : public gko::LinOp {
 public:
     cusparseMatDescr_t get_descr() const { return this->descr_.get(); }
 
-    const gko::CudaExecutor *get_gpu_exec() const { return gpu_exec_.get(); }
+    // Hands out the executor as a shared pointer instead of a plain pointer,
+    // so that cusp_generic_spmv can pass it on to gko::Array when it
+    // allocates its buffer.
+    std::shared_ptr<const gko::CudaExecutor> get_gpu_exec() const
+    {
+        return this->gpu_exec_;
+    }
 
 protected:
     void apply_impl(const gko::LinOp *, const gko::LinOp *, const gko::LinOp *,
@@ -475,6 +482,204 @@ class CuspHybrid
 };
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
+/**
+ * Runs the cuSPARSE generic SpMV (cusparseSpMV) with the sparse matrix `mat`
+ * on the dense vector `b` and stores the result in `x`.
+ *
+ * `b` and `x` are cast to gko::matrix::Dense<ValueType> with gko::as. The
+ * temporary buffer whose size is reported by cusparseSpMV_bufferSize is
+ * allocated through a gko::Array<char> on `gpu_exec`.
+ *
+ * @tparam ValueType  the value type of the scalars and the dense vectors
+ *
+ * @param gpu_exec  the executor providing the device id and cuSPARSE handle
+ * @param mat  the cuSPARSE descriptor of the sparse matrix
+ * @param scalars  array containing {alpha, beta}; both are handed to
+ *                 cuSPARSE by pointer
+ * @param b  the input vector
+ * @param x  the output vector
+ * @param trans  the operation applied to `mat`
+ * @param alg  the SpMV algorithm passed to cuSPARSE
+ */
+template <typename ValueType>
+void cusp_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
+                       const cusparseSpMatDescr_t mat,
+                       const gko::Array<ValueType> &scalars,
+                       const gko::LinOp *b, gko::LinOp *x,
+                       cusparseOperation_t trans, cusparseSpMVAlg_t alg)
+{
+    using gko::kernels::cuda::as_culibs_type;
+    using dense = gko::matrix::Dense<ValueType>;
+    const cudaDataType_t value_type =
+        gko::kernels::cuda::cuda_data_type<ValueType>();
+    const auto input = gko::as<dense>(b);
+    const auto output = gko::as<dense>(x);
+    const auto input_values = input->get_const_values();
+    const auto output_values = output->get_values();
+    gko::cuda::device_guard g{gpu_exec->get_device_id()};
+
+    cusparseDnVecDescr_t vecx;
+    cusparseDnVecDescr_t vecb;
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(
+        cusparseCreateDnVec(&vecx, output->get_num_stored_elements(),
+                            as_culibs_type(output_values), value_type));
+    // cusparseCreateDnVec only accepts a non-const pointer, so the constness
+    // of the input values has to be cast away
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnVec(
+        &vecb, input->get_num_stored_elements(),
+        as_culibs_type(const_cast<ValueType *>(input_values)), value_type));
+
+    const auto handle = gpu_exec->get_cusparse_handle();
+    // scalars holds {alpha, beta} contiguously
+    const auto alpha = scalars.get_const_data();
+    const auto beta = alpha + 1;
+    size_t workspace_size = 0;
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(
+        cusparseSpMV_bufferSize(handle, trans, alpha, mat, vecb, beta, vecx,
+                                value_type, alg, &workspace_size));
+    gko::Array<char> workspace(gpu_exec, workspace_size);
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(
+        cusparseSpMV(handle, trans, alpha, mat, vecb, beta, vecx, value_type,
+                     alg, workspace.get_data()));
+
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx));
+    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb));
+}
+
+
+/**
+ * Benchmark LinOp that keeps its data in a gko::matrix::Csr and applies it
+ * through the cuSPARSE generic SpMV interface (see cusp_generic_spmv).
+ *
+ * @tparam ValueType  the value type of the matrix
+ * @tparam IndexType  the index type of the matrix
+ * @tparam Alg  the cuSPARSE SpMV algorithm used when applying the matrix
+ */
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32,
+          cusparseSpMVAlg_t Alg = CUSPARSE_MV_ALG_DEFAULT>
+class CuspGenericCsr
+    : public gko::EnableLinOp<CuspGenericCsr<ValueType, IndexType, Alg>,
+                              CuspBase>,
+      public gko::EnableCreateMethod<CuspGenericCsr<ValueType, IndexType, Alg>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<CuspGenericCsr>;
+    friend class gko::EnablePolymorphicObject<CuspGenericCsr, CuspBase>;
+
+public:
+    using csr = gko::matrix::Csr<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+    // cuSPARSE enum values describing IndexType and ValueType
+    cusparseIndexType_t cu_index =
+        gko::kernels::cuda::cusparse_index_type<IndexType>();
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+
+    /**
+     * Reads the matrix data into the underlying Csr matrix, adopts its size
+     * and creates the cuSPARSE sparse matrix descriptor on top of the Csr
+     * arrays.
+     *
+     * @param data  the matrix data to read
+     */
+    void read(const mat_data &data) override
+    {
+        using gko::kernels::cuda::as_culibs_type;
+        csr_->read(data);
+        this->set_size(gko::dim<2>{csr_->get_size()});
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseCreateCsr(&mat_, csr_->get_size()[0], csr_->get_size()[1],
+                              csr_->get_num_stored_elements(),
+                              as_culibs_type(csr_->get_row_ptrs()),
+                              as_culibs_type(csr_->get_col_idxs()),
+                              as_culibs_type(csr_->get_values()), cu_index,
+                              cu_index, CUSPARSE_INDEX_BASE_ZERO, cu_value));
+    }
+
+    /**
+     * Returns the number of elements stored by the underlying Csr matrix.
+     */
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return csr_->get_num_stored_elements();
+    }
+
+    /**
+     * Destroys the cuSPARSE descriptor, if one was created. Errors are only
+     * reported on std::cerr, they never leave the destructor.
+     */
+    ~CuspGenericCsr() override
+    {
+        // read() may never have been called, or may have thrown before the
+        // descriptor was created; then there is nothing to destroy
+        if (mat_ == nullptr) {
+            return;
+        }
+        const auto id = this->get_gpu_exec()->get_device_id();
+        try {
+            gko::cuda::device_guard g{id};
+            GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
+        } catch (const std::exception &e) {
+            std::cerr << "Error when unallocating CuspGenericCsr mat_ matrix: "
+                      << e.what() << std::endl;
+        }
+    }
+
+    CuspGenericCsr(const CuspGenericCsr &other) = delete;
+
+    // NOTE(review): the defaulted assignment copies the raw mat_ handle, so
+    // two objects can end up destroying the same descriptor - confirm that
+    // this operator is only needed to satisfy the polymorphic object mixins.
+    CuspGenericCsr &operator=(const CuspGenericCsr &other) = default;
+
+protected:
+    /**
+     * Applies the matrix to `b` and stores the result in `x`, using the
+     * {alpha, beta} pair held in `scalars` and the algorithm `Alg`.
+     */
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
+                          Alg);
+    }
+
+    /**
+     * Creates an empty matrix backed by a Csr with the classical strategy.
+     *
+     * @param exec  the executor of the matrix
+     * @param size  the size of the matrix
+     */
+    CuspGenericCsr(std::shared_ptr<const gko::Executor> exec,
+                   const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<CuspGenericCsr, CuspBase>(exec, size),
+          csr_(csr::create(exec, std::make_shared<typename csr::classical>())),
+          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    // holds the row pointers, column indices and values referenced by mat_
+    std::shared_ptr<csr> csr_;
+    cusparseOperation_t trans_;
+    // descriptor created by read(); stays nullptr until then
+    cusparseSpMatDescr_t mat_{nullptr};
+};
+
+
+/**
+ * Benchmark LinOp that keeps its data in a gko::matrix::Coo and applies it
+ * through the cuSPARSE generic SpMV interface (see cusp_generic_spmv) with
+ * CUSPARSE_MV_ALG_DEFAULT.
+ *
+ * @tparam ValueType  the value type of the matrix
+ * @tparam IndexType  the index type of the matrix
+ */
+template <typename ValueType = gko::default_precision,
+          typename IndexType = gko::int32>
+class CuspGenericCoo
+    : public gko::EnableLinOp<CuspGenericCoo<ValueType, IndexType>, CuspBase>,
+      public gko::EnableCreateMethod<CuspGenericCoo<ValueType, IndexType>>,
+      public gko::ReadableFromMatrixData<ValueType, IndexType> {
+    friend class gko::EnableCreateMethod<CuspGenericCoo>;
+    friend class gko::EnablePolymorphicObject<CuspGenericCoo, CuspBase>;
+
+public:
+    using coo = gko::matrix::Coo<ValueType, IndexType>;
+    using mat_data = gko::matrix_data<ValueType, IndexType>;
+    // cuSPARSE enum values describing IndexType and ValueType
+    cusparseIndexType_t cu_index =
+        gko::kernels::cuda::cusparse_index_type<IndexType>();
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+
+    /**
+     * Reads the matrix data into the underlying Coo matrix, adopts its size
+     * and creates the cuSPARSE sparse matrix descriptor on top of the Coo
+     * arrays.
+     *
+     * @param data  the matrix data to read
+     */
+    void read(const mat_data &data) override
+    {
+        using gko::kernels::cuda::as_culibs_type;
+        coo_->read(data);
+        this->set_size(gko::dim<2>{coo_->get_size()});
+        GKO_ASSERT_NO_CUSPARSE_ERRORS(
+            cusparseCreateCoo(&mat_, coo_->get_size()[0], coo_->get_size()[1],
+                              coo_->get_num_stored_elements(),
+                              as_culibs_type(coo_->get_row_idxs()),
+                              as_culibs_type(coo_->get_col_idxs()),
+                              as_culibs_type(coo_->get_values()), cu_index,
+                              CUSPARSE_INDEX_BASE_ZERO, cu_value));
+    }
+
+    /**
+     * Returns the number of elements stored by the underlying Coo matrix.
+     */
+    gko::size_type get_num_stored_elements() const noexcept
+    {
+        return coo_->get_num_stored_elements();
+    }
+
+    /**
+     * Destroys the cuSPARSE descriptor, if one was created. Errors are only
+     * reported on std::cerr, they never leave the destructor.
+     */
+    ~CuspGenericCoo() override
+    {
+        // read() may never have been called, or may have thrown before the
+        // descriptor was created; then there is nothing to destroy
+        if (mat_ == nullptr) {
+            return;
+        }
+        const auto id = this->get_gpu_exec()->get_device_id();
+        try {
+            gko::cuda::device_guard g{id};
+            GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
+        } catch (const std::exception &e) {
+            std::cerr << "Error when unallocating CuspGenericCoo mat_ matrix: "
+                      << e.what() << std::endl;
+        }
+    }
+
+    CuspGenericCoo(const CuspGenericCoo &other) = delete;
+
+    // NOTE(review): the defaulted assignment copies the raw mat_ handle, so
+    // two objects can end up destroying the same descriptor - confirm that
+    // this operator is only needed to satisfy the polymorphic object mixins.
+    CuspGenericCoo &operator=(const CuspGenericCoo &other) = default;
+
+protected:
+    /**
+     * Applies the matrix to `b` and stores the result in `x`, using the
+     * {alpha, beta} pair held in `scalars` and CUSPARSE_MV_ALG_DEFAULT.
+     */
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_,
+                          CUSPARSE_MV_ALG_DEFAULT);
+    }
+
+    /**
+     * Creates an empty matrix backed by a Coo matrix on `exec`.
+     *
+     * @param exec  the executor of the matrix
+     * @param size  the size of the matrix
+     */
+    CuspGenericCoo(std::shared_ptr<const gko::Executor> exec,
+                   const gko::dim<2> &size = gko::dim<2>{})
+        : gko::EnableLinOp<CuspGenericCoo, CuspBase>(exec, size),
+          coo_(coo::create(exec)),
+          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+    {}
+
+private:
+    // Contains {alpha, beta}
+    gko::Array<ValueType> scalars{
+        this->get_executor(), {gko::one<ValueType>(), gko::zero<ValueType>()}};
+    // holds the row indices, column indices and values referenced by mat_
+    std::shared_ptr<coo> coo_;
+    cusparseOperation_t trans_;
+    // descriptor created by read(); stays nullptr until then
+    cusparseSpMatDescr_t mat_{nullptr};
+};
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
 }  // namespace detail
 
 
@@ -485,6 +690,18 @@ using cusp_csrmp = detail::CuspCsrmp<>;
 using cusp_csrmm = detail::CuspCsrmm<>;
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
+// cuSPARSE generic SpMV on CSR with the default template arguments, i.e. the
+// CUSPARSE_MV_ALG_DEFAULT algorithm
+using cusp_gcsr = detail::CuspGenericCsr<>;
+// cuSPARSE generic SpMV on CSR with the CUSPARSE_CSRMV_ALG2 algorithm
+using cusp_gcsr2 =
+    detail::CuspGenericCsr<double, gko::int32, CUSPARSE_CSRMV_ALG2>;
+// cuSPARSE generic SpMV on COO with the default template arguments
+using cusp_gcoo = detail::CuspGenericCoo<>;
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
 using cusp_coo =
     detail::CuspHybrid<double, gko::int32, CUSPARSE_HYB_PARTITION_USER, 0>;
 using cusp_ell =
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
@@ -97,8 +97,18 @@ std::string format_description =
     "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n"
     "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n"
     "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function."
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+    "\n"
+    "cusp_gcsr: benchmark CuSPARSE with the generic csr with default "
+    "algorithm.\n"
+    "cusp_gcsr2: benchmark CuSPARSE with the generic csr with "
+    "CUSPARSE_CSRMV_ALG2.\n"
+    "cusp_gcoo: benchmark CuSPARSE with the generic coo with default "
+    "algorithm.\n"
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
 #endif  // HAS_CUDA
 #ifdef HAS_HIP
+    "\n"
     "hipsp_csr: benchmark HipSPARSE with the hipsparseXcsrmv function.\n"
     "hipsp_csrmm: benchmark HipSPARSE with the hipsparseXcsrmv_mm function.\n"
     "hipsp_hybrid: benchmark HipSPARSE spmv with hipsparseXhybmv and an "
@@ -163,6 +173,7 @@ std::unique_ptr<MatrixType> read_matrix_from_data(
     }
 
 
+// clang-format off
 const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
                                 std::shared_ptr<const gko::Executor>,
                                 const gko::matrix_data<> &)>>
@@ -181,6 +192,11 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
         {"cusp_coo", read_matrix_from_data<cusp_coo>},
         {"cusp_ell", read_matrix_from_data<cusp_ell>},
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+        {"cusp_gcsr", read_matrix_from_data<cusp_gcsr>},
+        {"cusp_gcsr2", read_matrix_from_data<cusp_gcsr2>},
+        {"cusp_gcoo", read_matrix_from_data<cusp_gcoo>},
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
 #endif  // HAS_CUDA
 #ifdef HAS_HIP
         {"hipsp_csr", read_matrix_from_data<hipsp_csr>},
@@ -216,6 +232,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
          READ_MATRIX(hybrid,
                      std::make_shared<hybrid::minimal_storage_limit>())},
         {"sellp", read_matrix_from_data<gko::matrix::Sellp<>>}};
+// clang-format on
 
 
 }  // namespace formats
diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc
@@ -85,7 +85,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);
 
 
-#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10100))
+#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010))
 // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS
 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
 #endif
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cublas_v2.h>
+#include <cusparse.h>
 #include <thrust/complex.h>
 
 
@@ -190,6 +191,31 @@ constexpr cudaDataType_t cuda_data_type_impl<uint8>()
 }
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
+/**
+ * Generic fallback, used for every type without a specialization. It returns
+ * CUSPARSE_INDEX_16U.
+ *
+ * NOTE(review): types other than int32 / int64 are not rejected at compile
+ * time here - confirm that callers only pass supported index types.
+ */
+template <typename T>
+constexpr cusparseIndexType_t cusparse_index_type_impl()
+{
+    return CUSPARSE_INDEX_16U;
+}
+
+// int32 is reported as CUSPARSE_INDEX_32I
+template <>
+constexpr cusparseIndexType_t cusparse_index_type_impl<int32>()
+{
+    return CUSPARSE_INDEX_32I;
+}
+
+// int64 is reported as CUSPARSE_INDEX_64I
+template <>
+constexpr cusparseIndexType_t cusparse_index_type_impl<int64>()
+{
+    return CUSPARSE_INDEX_64I;
+}
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
 }  // namespace detail
 
 
@@ -208,6 +234,27 @@ constexpr cudaDataType_t cuda_data_type()
 }
 
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
+/**
+ * This is an alias for the `cusparseIndexType_t` equivalent of `T`. It
+ * forwards to detail::cusparse_index_type_impl; by default,
+ * CUSPARSE_INDEX_16U is returned.
+ *
+ * @tparam T  a type
+ *
+ * @returns the `cusparseIndexType_t` value corresponding to `T`
+ */
+template <typename T>
+constexpr cusparseIndexType_t cusparse_index_type()
+{
+    return detail::cusparse_index_type_impl<T>();
+}
+
+
+#endif  // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010)
+
+
 /**
  * This is an alias for CUDA's equivalent of `T`.
  *