From dca058a2c1988920f9b60e1f303c4fbf6aad4c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Sat, 3 Aug 2024 12:26:02 +0200 Subject: [PATCH 01/19] Refactor --- include/dr/mp/halo.hpp | 431 +------------------------------- include/dr/mp/halo/format.hpp | 19 ++ include/dr/mp/halo/group.hpp | 160 ++++++++++++ include/dr/mp/halo/halo.hpp | 182 ++++++++++++++ include/dr/mp/halo/instance.hpp | 98 ++++++++ test/gtest/mp/CMakeLists.txt | 124 ++++----- 6 files changed, 528 insertions(+), 486 deletions(-) create mode 100644 include/dr/mp/halo/format.hpp create mode 100644 include/dr/mp/halo/group.hpp create mode 100644 include/dr/mp/halo/halo.hpp create mode 100644 include/dr/mp/halo/instance.hpp diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp index 7f7b7dbdb1..53cabb853c 100644 --- a/include/dr/mp/halo.hpp +++ b/include/dr/mp/halo.hpp @@ -4,430 +4,7 @@ #pragma once -#include -#include - -namespace dr::mp { - -enum class halo_tag { - invalid, - forward, - reverse, - index, -}; - -template class halo_impl { - using T = typename Group::element_type; - using Memory = typename Group::memory_type; - -public: - using group_type = Group; - - // Destructor frees buffer_, so cannot copy - halo_impl(const halo_impl &) = delete; - halo_impl operator=(const halo_impl &) = delete; - - /// halo constructor - halo_impl(communicator comm, const std::vector &owned_groups, - const std::vector &halo_groups, - const Memory &memory = Memory()) - : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), - memory_(memory) { - DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), - rng::size(halo_groups)); - buffer_size_ = 0; - std::size_t i = 0; - std::vector buffer_index; - for (auto &g : owned_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - for (auto &g : halo_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - buffer_ = memory_.allocate(buffer_size_); - assert(buffer_ != nullptr); - i = 0; - for (auto &g : owned_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - for (auto &g : halo_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - requests_.resize(i); - } - - /// Begin a halo exchange - void exchange_begin() { - DRLOG("Halo exchange receiving"); - receive(halo_groups_); - DRLOG("Halo exchange sending"); - send(owned_groups_); - DRLOG("Halo exchange begin finished"); - } - - /// Complete a halo exchange - void exchange_finalize() { - DRLOG("Halo exchange finalize started"); - reduce_finalize(); - DRLOG("Halo exchange finalize finished"); - } - - void exchange() { - exchange_begin(); - exchange_finalize(); - } - - /// Begin a halo reduction - void reduce_begin() { - receive(owned_groups_); - send(halo_groups_); - } - - /// Complete a halo reduction - void reduce_finalize(const auto &op) { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize(op) waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(op); - } - } - } - - /// Complete a halo reduction - void reduce_finalize() { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize() waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(); - } - } - } - - struct second_op { - T operator()(T &a, T &b) const { return b; } - } second; - - struct plus_op { - T operator()(T &a, T &b) const { return a + b; } - } plus; - - struct max_op { - T operator()(T &a, T &b) const { return std::max(a, b); } - } max; - - struct min_op { - T operator()(T &a, T &b) const { return std::min(a, b); } - } min; - - struct multiplies_op { - T operator()(T &a, T &b) const { return a * b; } - } multiplies; - - ~halo_impl() { - if (buffer_) { - memory_.deallocate(buffer_, buffer_size_); - buffer_ = nullptr; - } - } - -private: - void send(std::vector &sends) { - for (auto &g : sends) { - g.pack(); - g.receive = false; - DRLOG("sending: {}", g.request_index); - comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } - } - - void receive(std::vector &receives) { - for (auto &g : receives) { - g.receive = true; - DRLOG("receiving: {}", g.request_index); - comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } - } - - communicator comm_; - std::vector halo_groups_, owned_groups_; - T *buffer_ = nullptr; - std::size_t buffer_size_; - std::vector requests_; - std::vector map_; - Memory memory_; -}; - -template > class index_group { -public: - using element_type = T; - using memory_type = Memory; - T *buffer = nullptr; - std::size_t request_index; - bool receive; - bool buffered; - - /// Constructor - index_group(T *data, std::size_t rank, - const std::vector &indices, const Memory &memory) - : memory_(memory), data_(data), rank_(rank) { - buffered = false; - for (std::size_t i = 0; i < rng::size(indices) - 1; i++) { - buffered = buffered || (indices[i + 1] - indices[i] != 1); - } - indices_size_ = rng::size(indices); - indices_ = memory_.template allocate(indices_size_); - assert(indices_ != nullptr); - memory_.memcpy(indices_, indices.data(), - indices_size_ * sizeof(std::size_t)); - } - - index_group(const index_group &o) - : buffer(o.buffer), request_index(o.request_index), receive(o.receive), - buffered(o.buffered), memory_(o.memory_), data_(o.data_), - rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) { - indices_ = memory_.template allocate(indices_size_); - assert(indices_ != nullptr); - memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t)); - } - - void unpack(const auto &op) { - T *dpt = data_; - auto n = indices_size_; - auto *ipt = indices_; - auto *b = buffer; - memory_.offload([=]() { - for (std::size_t i = 0; i < n; i++) { - dpt[ipt[i]] = op(dpt[ipt[i]], b[i]); - } - }); - } - - void pack() { - T *dpt = data_; - auto n = indices_size_; - auto *ipt = indices_; - auto *b = buffer; - memory_.offload([=]() { - for (std::size_t i = 0; i < n; i++) { - b[i] = dpt[ipt[i]]; - } - }); - } - - std::size_t buffer_size() { - if (buffered) { - return indices_size_; - } - return 0; - } - - T *data_pointer() { - if (buffered) { - return buffer; - } else { - return &data_[indices_[0]]; - } - } - - std::size_t data_size() { return indices_size_; } - - std::size_t rank() { return rank_; } - auto tag() { return tag_; } - - ~index_group() { - if (indices_) { - memory_.template deallocate(indices_, indices_size_); - indices_ = nullptr; - } - } - -private: - Memory memory_; - T *data_ = nullptr; - std::size_t rank_; - std::size_t indices_size_; - std::size_t *indices_; - halo_tag tag_ = halo_tag::index; -}; - -template -using unstructured_halo_impl = halo_impl>; - -template > -class unstructured_halo : public unstructured_halo_impl { -public: - using group_type = index_group; - using index_map = std::pair>; - - /// - /// Constructor - /// - unstructured_halo(communicator comm, T *data, - const std::vector &owned, - const std::vector &halo, - const Memory &memory = Memory()) - : unstructured_halo_impl( - comm, make_groups(comm, data, owned, memory), - make_groups(comm, data, halo, memory), memory) {} - -private: - static std::vector make_groups(communicator comm, T *data, - const std::vector &map, - const Memory &memory) { - std::vector groups; - for (auto const &[rank, indices] : map) { - groups.emplace_back(data, rank, indices, memory); - } - return groups; - } -}; - -template > class span_group { -public: - using element_type = T; - using memory_type = Memory; - T *buffer = nullptr; - std::size_t request_index = 0; - bool receive = false; - bool buffered = false; - - span_group(std::span data, std::size_t rank, halo_tag tag) - : data_(data), rank_(rank), tag_(tag) { -#ifdef SYCL_LANGUAGE_VERSION - if (use_sycl() && sycl_mem_kind() == sycl::usm::alloc::shared) { - buffered = true; - } -#endif - } - - void unpack() { - if (buffered) { - if (mp::use_sycl()) { - __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data()); - } else { - std::copy(buffer, buffer + rng::size(data_), data_.data()); - } - } - } - - void pack() { - if (buffered) { - if (mp::use_sycl()) { - __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_), - buffer); - } else { - std::copy(data_.begin(), data_.end(), buffer); - } - } - } - std::size_t buffer_size() { return rng::size(data_); } - - std::size_t data_size() { return rng::size(data_); } - - T *data_pointer() { - if (buffered) { - return buffer; - } else { - return data_.data(); - } - } - - std::size_t rank() { return rank_; } - - auto tag() { return tag_; } - -private: - Memory memory_; - std::span data_; - std::size_t rank_; - halo_tag tag_ = halo_tag::invalid; -}; - -struct halo_bounds { - std::size_t prev = 0, next = 0; - bool periodic = false; -}; - -template -using span_halo_impl = halo_impl>; - -template > -class span_halo : public span_halo_impl { -public: - using group_type = span_group; - - span_halo() : span_halo_impl(communicator(), {}, {}) {} - - span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb) - : span_halo_impl(comm, owned_groups(comm, {data, size}, hb), - halo_groups(comm, {data, size}, hb)) { - check(size, hb); - } - - span_halo(communicator comm, std::span span, halo_bounds hb) - : span_halo_impl(comm, owned_groups(comm, span, hb), - halo_groups(comm, span, hb)) {} - -private: - void check(auto size, auto hb) { - assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); - } - - static std::vector - owned_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector owned; - DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); - if (hb.next > 0 && (hb.periodic || !comm.first())) { - owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), - halo_tag::reverse); - } - if (hb.prev > 0 && (hb.periodic || !comm.last())) { - owned.emplace_back( - span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), - comm.next(), halo_tag::forward); - } - return owned; - } - - static std::vector - halo_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector halo; - if (hb.prev > 0 && (hb.periodic || !comm.first())) { - halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); - } - if (hb.next > 0 && (hb.periodic || !comm.last())) { - halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse); - } - return halo; - } -}; - -} // namespace dr::mp - -#ifdef DR_FORMAT - -template <> -struct fmt::formatter : formatter { - template - auto format(dr::mp::halo_bounds hb, FmtContext &ctx) { - return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next); - } -}; - -#endif +#include "halo/halo.hpp" +#include "halo/group.hpp" +#include "halo/instance.hpp" +#include "halo/format.hpp" diff --git a/include/dr/mp/halo/format.hpp b/include/dr/mp/halo/format.hpp new file mode 100644 index 0000000000..6c329ae63c --- /dev/null +++ b/include/dr/mp/halo/format.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include + +#ifdef DR_FORMAT + +template <> +struct fmt::formatter : formatter { + template + auto format(dr::mp::halo_bounds hb, FmtContext &ctx) { + return fmt::format_to(ctx.out(), "prev: {} next: {}", hb.prev, hb.next); + } +}; + +#endif diff --git a/include/dr/mp/halo/group.hpp b/include/dr/mp/halo/group.hpp new file mode 100644 index 0000000000..bcf24727ad --- /dev/null +++ b/include/dr/mp/halo/group.hpp @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +namespace dr::mp { + + + template > + class index_group { + public: + using element_type = T; + using memory_type = Memory; + T *buffer = nullptr; + std::size_t request_index; + bool receive; + bool buffered; + + /// Constructor + index_group(T *data, std::size_t rank, + const std::vector &indices, const Memory &memory) + : memory_(memory), data_(data), rank_(rank) { + buffered = false; + for (std::size_t i = 0; i < rng::size(indices) - 1; i++) { + buffered = buffered || (indices[i + 1] - indices[i] != 1); + } + indices_size_ = rng::size(indices); + indices_ = memory_.template allocate(indices_size_); + assert(indices_ != nullptr); + memory_.memcpy(indices_, indices.data(), + indices_size_ * sizeof(std::size_t)); + } + + index_group(const index_group &o) + : buffer(o.buffer), request_index(o.request_index), receive(o.receive), + buffered(o.buffered), memory_(o.memory_), data_(o.data_), + rank_(o.rank_), indices_size_(o.indices_size_), tag_(o.tag_) { + indices_ = memory_.template allocate(indices_size_); + assert(indices_ != nullptr); + memory_.memcpy(indices_, o.indices_, indices_size_ * sizeof(std::size_t)); + } + + void unpack(const auto &op) { + T *dpt = data_; + auto n = indices_size_; + auto *ipt = indices_; + auto *b = buffer; + memory_.offload([=]() { + for (std::size_t i = 0; i < n; i++) { + dpt[ipt[i]] = op(dpt[ipt[i]], b[i]); + } + }); + } + + void pack() { + T *dpt = data_; + auto n = indices_size_; + auto *ipt = indices_; + auto *b = buffer; + memory_.offload([=]() { + for (std::size_t i = 0; i < n; i++) { + b[i] = dpt[ipt[i]]; + } + }); + } + + std::size_t buffer_size() { + if (buffered) { + return indices_size_; + } + return 0; + } + + T *data_pointer() { + if (buffered) { + return buffer; + } else { + return &data_[indices_[0]]; + } + } + + std::size_t data_size() { return indices_size_; } + + std::size_t rank() { return rank_; } + auto tag() { return tag_; } + + ~index_group() { + if (indices_) { + memory_.template deallocate(indices_, indices_size_); + indices_ = nullptr; + } + } + + private: + Memory memory_; + T *data_ = nullptr; + std::size_t rank_; + std::size_t indices_size_; + std::size_t *indices_; + halo_tag tag_ = halo_tag::index; + }; + + template > class span_group { + public: + using element_type = T; + using memory_type = Memory; + T *buffer = nullptr; + std::size_t request_index = 0; + bool receive = false; + bool buffered = false; + + span_group(std::span data, std::size_t rank, halo_tag tag) + : data_(data), rank_(rank), tag_(tag) { +#ifdef SYCL_LANGUAGE_VERSION + if (mp::use_sycl() && mp::sycl_mem_kind() == sycl::usm::alloc::shared) { + buffered = true; + } +#endif + } + + /// If span is buffered, push buffer to data + void unpack() { + if (buffered) { + __detail::sycl_copy(buffer, buffer + rng::size(data_), data_.data()); + } + } + + /// If span is buffered, pull data into buffer + void pack() { + if (buffered) { + __detail::sycl_copy(data_.data(), data_.data() + rng::size(data_), buffer); + } + } + + std::size_t buffer_size() { return rng::size(data_); } + + std::size_t data_size() { return rng::size(data_); } + + T *data_pointer() { + if (buffered) { + return buffer; + } else { + return data_.data(); + } + } + + std::size_t rank() { return rank_; } + + auto tag() { return tag_; } + + private: + std::span data_; + std::size_t rank_; + halo_tag tag_ = halo_tag::invalid; + }; +} \ No newline at end of file diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp new file mode 100644 index 0000000000..ce4f6305aa --- /dev/null +++ b/include/dr/mp/halo/halo.hpp @@ -0,0 +1,182 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +namespace dr::mp { + + enum class halo_tag { + invalid, + forward, + reverse, + index, + }; + + struct halo_bounds { + std::size_t prev = 0, next = 0; + bool periodic = false; + }; + + template + class halo_impl { + using T = typename Group::element_type; + using Memory = typename Group::memory_type; + + public: + using group_type = Group; + + // Destructor frees buffer_, so cannot copy + halo_impl(const halo_impl &) = delete; + + halo_impl operator=(const halo_impl &) = delete; + + /// halo constructor + halo_impl(communicator comm, const std::vector &owned_groups, + const std::vector &halo_groups, + const Memory &memory = Memory()) + : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), + memory_(memory) { + DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), + rng::size(halo_groups)); + buffer_size_ = 0; + std::size_t i = 0; + std::vector buffer_index; + for (auto &g: owned_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); + } + for (auto &g: halo_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); + } + buffer_ = memory_.allocate(buffer_size_); + assert(buffer_ != nullptr); + i = 0; + for (auto &g: owned_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; + } + for (auto &g: halo_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; + } + requests_.resize(i); + } + + /// Begin a halo exchange + void exchange_begin() { + DRLOG("Halo exchange receiving"); + receive(halo_groups_); + DRLOG("Halo exchange sending"); + send(owned_groups_); + DRLOG("Halo exchange begin finished"); + } + + /// Complete a halo exchange + void exchange_finalize() { + DRLOG("Halo exchange finalize started"); + reduce_finalize(); + DRLOG("Halo exchange finalize finished"); + } + + void exchange() { + exchange_begin(); + exchange_finalize(); + } + + /// Begin a halo reduction + void reduce_begin() { + receive(owned_groups_); + send(halo_groups_); + } + + /// Complete a halo reduction + void reduce_finalize(const auto &op) { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize(op) waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(op); + } + } + } + + /// Complete a halo reduction + void reduce_finalize() { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize() waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(); + } + } + } + + struct second_op { + T operator()(T &a, T &b) const { return b; } + } second; + + struct plus_op { + T operator()(T &a, T &b) const { return a + b; } + } plus; + + struct max_op { + T operator()(T &a, T &b) const { return std::max(a, b); } + } max; + + struct min_op { + T operator()(T &a, T &b) const { return std::min(a, b); } + } min; + + struct multiplies_op { + T operator()(T &a, T &b) const { return a * b; } + } multiplies; + + ~halo_impl() { + if (buffer_) { + memory_.deallocate(buffer_, buffer_size_); + buffer_ = nullptr; + } + } + + private: + void send(std::vector &sends) { + for (auto &g: sends) { + g.pack(); + g.receive = false; + DRLOG("sending: {}", g.request_index); + comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); + } + } + + void receive(std::vector &receives) { + for (auto &g: receives) { + g.receive = true; + DRLOG("receiving: {}", g.request_index); + comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); + } + } + + communicator comm_; + std::vector halo_groups_, owned_groups_; + T *buffer_ = nullptr; + std::size_t buffer_size_; + std::vector requests_; + std::vector map_; + Memory memory_; + }; +} diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp new file mode 100644 index 0000000000..5fdb4555a1 --- /dev/null +++ b/include/dr/mp/halo/instance.hpp @@ -0,0 +1,98 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include +#include "halo.hpp" +#include "group.hpp" + +namespace dr::mp { +template +using unstructured_halo_impl = halo_impl >; + +template> +class unstructured_halo : public unstructured_halo_impl { +public: + using group_type = index_group; + using index_map = std::pair >; + + /// + /// Constructor + /// + unstructured_halo(communicator comm, T *data, + const std::vector &owned, + const std::vector &halo, + const Memory &memory = Memory()) + : unstructured_halo_impl( + comm, make_groups(comm, data, owned, memory), + make_groups(comm, data, halo, memory), memory) {} + +private: + static std::vector make_groups(communicator comm, T *data, + const std::vector &map, + const Memory &memory) { + std::vector groups; + for (auto const &[rank, indices]: map) { + groups.emplace_back(data, rank, indices, memory); + } + return groups; + } +}; + +template +using span_halo_impl = halo_impl >; + +template> +class span_halo : public span_halo_impl { +public: + using group_type = span_group; + + span_halo() : span_halo_impl(communicator(), {}, {}) {} + + span_halo(communicator comm, T *data, std::size_t size, halo_bounds hb) + : span_halo_impl(comm, owned_groups(comm, {data, size}, hb), + halo_groups(comm, {data, size}, hb)) { + check(size, hb); + } + + span_halo(communicator comm, std::span span, halo_bounds hb) + : span_halo_impl(comm, owned_groups(comm, span, hb), + halo_groups(comm, span, hb)) {} + +private: + void check(auto size, auto hb) { + assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); + } + + static std::vector + owned_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector owned; + DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); + if (hb.next > 0 && (hb.periodic || !comm.first())) { + owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), + halo_tag::reverse); + } + if (hb.prev > 0 && (hb.periodic || !comm.last())) { + owned.emplace_back( + span.subspan(rng::size(span) - (hb.prev + hb.next), hb.prev), + comm.next(), halo_tag::forward); + } + return owned; + } + + static std::vector + halo_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector halo; + if (hb.prev > 0 && (hb.periodic || !comm.first())) { + halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); + } + if (hb.next > 0 && (hb.periodic || !comm.last())) { + halo.emplace_back(span.last(hb.next), comm.next(), halo_tag::reverse); + } + return halo; + } +}; +} \ No newline at end of file diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index cef65af431..a300d47d1e 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -7,69 +7,74 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) # tested with a variable number of ranks # cmake-format: off add_executable( - mp-tests - mp-tests.cpp - ../common/all.cpp - ../common/copy.cpp - ../common/counted.cpp - ../common/distributed_vector.cpp - ../common/drop.cpp - ../common/enumerate.cpp - ../common/equal.cpp - ../common/exclusive_scan.cpp - ../common/fill.cpp - ../common/for_each.cpp - ../common/inclusive_scan.cpp - ../common/iota.cpp - ../common/iota_view.cpp - ../common/reduce.cpp - ../common/sort.cpp - ../common/subrange.cpp - ../common/sycl_utils.cpp - ../common/take.cpp - ../common/transform.cpp - ../common/transform_view.cpp - ../common/zip.cpp - ../common/zip_local.cpp - alignment.cpp - communicator.cpp - copy.cpp - distributed_vector.cpp - halo.cpp - mdstar.cpp - mpsort.cpp - reduce.cpp - stencil.cpp - segments.cpp - slide_view.cpp - wave_kernel.cpp) + mp-tests + mp-tests.cpp + ../common/all.cpp + ../common/copy.cpp + ../common/counted.cpp + ../common/distributed_vector.cpp + ../common/drop.cpp + ../common/enumerate.cpp + ../common/equal.cpp + ../common/exclusive_scan.cpp + ../common/fill.cpp + ../common/for_each.cpp + ../common/inclusive_scan.cpp + ../common/iota.cpp + ../common/iota_view.cpp + ../common/reduce.cpp + ../common/sort.cpp + ../common/subrange.cpp + ../common/sycl_utils.cpp + ../common/take.cpp + ../common/transform.cpp + ../common/transform_view.cpp + ../common/zip.cpp + ../common/zip_local.cpp + alignment.cpp + communicator.cpp + copy.cpp + distributed_vector.cpp + halo.cpp + mdstar.cpp + mpsort.cpp + reduce.cpp + stencil.cpp + segments.cpp + slide_view.cpp + wave_kernel.cpp) add_executable( - mp-tests-3 - mp-tests.cpp - communicator-3.cpp - halo-3.cpp - slide_view-3.cpp) + mp-tests-3 + mp-tests.cpp + communicator-3.cpp + halo-3.cpp + slide_view-3.cpp) -# mp-quick-test is for development. By reducing the number of source files, it +# mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It # is OK to commit changes to the source file list. add_executable(mp-quick-test - mp-tests.cpp - ../common/equal.cpp - ) + mp-tests.cpp + halo.cpp +) +add_executable(mp-quick-test-3-only + mp-tests.cpp + halo-3.cpp +) # cmake-format: on target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST) +target_compile_definitions(mp-quick-test-3-only PRIVATE QUICK_TEST) -foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test) +foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-only) if(ENABLE_ISHMEM) target_link_ishmem(${test-exec}) endif() target_link_libraries(${test-exec} GTest::gtest_main cxxopts DR::mpi) set_property(TARGET ${test-exec} PROPERTY RULE_LAUNCH_COMPILE - "${CMAKE_COMMAND} -E time") + "${CMAKE_COMMAND} -E time") endforeach() # tests without --sycl flag will fail on IshmemBackend TODO: make them be @@ -77,6 +82,7 @@ endforeach() if(NOT ENABLE_ISHMEM) add_mp_ctest(NAME mp-quick-test NPROC 1) add_mp_ctest(NAME mp-quick-test NPROC 2) + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3) cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) @@ -87,8 +93,7 @@ if(NOT ENABLE_ISHMEM) foreach(nproc RANGE 2 4) add_mp_ctest(NAME mp-tests NPROC ${nproc} TIMEOUT 150) endforeach() - add_mp_ctest( - TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150) + add_mp_ctest(TEST_NAME mp-tests-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150) endif() if(ENABLE_SYCL) @@ -102,32 +107,33 @@ if(ENABLE_SYCL) # equality of these values: *(--counted_result.end()) Which is: 5, should be # 77 Mdspan, Mdarray hangs sometimes on ISHMEM. set(sycl-exclusions - ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) + ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) endif() foreach(nproc RANGE 1 4) add_mp_ctest(NAME mp-quick-test NPROC ${nproc} SYCL) - add_mp_ctest( - NAME mp-quick-test NPROC ${nproc} OFFLOAD SYCL TARGS --device-memory) + add_mp_ctest(NAME mp-quick-test NPROC ${nproc} OFFLOAD SYCL TARGS --device-memory) endforeach() + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 SYCL) + add_mp_ctest(NAME mp-quick-test-3-only NPROC 3 OFFLOAD SYCL TARGS --device-memory) add_mp_ctest( - NAME mp-tests NPROC 2 TIMEOUT 150 OFFLOAD SYCL TARGS --device-memory - --gtest_filter=-${sycl-exclusions}) + NAME mp-tests NPROC 2 TIMEOUT 150 OFFLOAD SYCL TARGS --device-memory + --gtest_filter=-${sycl-exclusions}) if(NOT MPI_IMPL STREQUAL "openmpi") # MPI_Win_create fails for communicator with size 1 add_mp_ctest( - NAME mp-tests NPROC 1 SYCL TARGS --gtest_filter=-${sycl-exclusions}) + NAME mp-tests NPROC 1 SYCL TARGS --gtest_filter=-${sycl-exclusions}) endif() # TODO: fix sycl Slide issues, see # https://github.com/oneapi-src/distributed-ranges/issues/322 foreach(nproc RANGE 2 4) add_mp_ctest( - NAME mp-tests NPROC ${nproc} TIMEOUT 150 SYCL TARGS - --gtest_filter=-${sycl-exclusions}) + NAME mp-tests NPROC ${nproc} TIMEOUT 150 SYCL TARGS + --gtest_filter=-${sycl-exclusions}) endforeach() add_mp_ctest( - TEST_NAME mp-tests-sycl-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150 SYCL - TARGS --gtest_filter=-${sycl-exclusions}) + TEST_NAME mp-tests-sycl-3-only NAME mp-tests-3 NPROC 3 TIMEOUT 150 SYCL + TARGS --gtest_filter=-${sycl-exclusions}) endif() From 97f429b77c72de36e8040e2f0729e12c091cffc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Thu, 15 Aug 2024 21:49:31 +0200 Subject: [PATCH 02/19] First tests --- benchmarks/gbench/mp/CMakeLists.txt | 12 +- benchmarks/gbench/mp/wave_equation-wide.cpp | 801 ++++++++++++++++++++ benchmarks/gbench/mp/wave_equation.cpp | 31 + include/dr/mp/halo/halo.hpp | 3 + test/gtest/mp/halo-3.cpp | 69 ++ 5 files changed, 914 insertions(+), 2 deletions(-) create mode 100644 benchmarks/gbench/mp/wave_equation-wide.cpp diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index c3ae2f682c..24d50a1e50 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -80,13 +80,21 @@ add_executable(wave_equation wave_equation.cpp) target_link_libraries(wave_equation cxxopts DR::mpi) target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) add_mp_ctest(NAME wave_equation) +add_executable(wave_equation-wide wave_equation-wide.cpp) +target_link_libraries(wave_equation-wide cxxopts DR::mpi) +target_compile_definitions(wave_equation-wide PRIVATE STANDALONE_BENCHMARK) +add_mp_ctest(NAME wave_equation-wide) # add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # # DRA-92 if(ENABLE_SYCL) add_mp_ctest( - TEST_NAME wave_equation-sycl NAME wave_equation NPROC 2 SYCL) + TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 200 NPROC 8 SYCL) add_mp_ctest( - TEST_NAME wave_equation_fused-sycl NAME wave_equation NPROC 2 SYCL TARGS -f) + TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 200 NPROC 2 SYCL TARGS -f) + add_mp_ctest( + TEST_NAME wave_equation-wide-sycl NAME wave_equation-wide TIMEOUT 200 NPROC 8 SYCL) + add_mp_ctest( + TEST_NAME wave_equation-wide_fused-sycl NAME wave_equation-wide TIMEOUT 200 NPROC 2 SYCL TARGS -f) endif() add_executable(shallow_water shallow_water.cpp) diff --git a/benchmarks/gbench/mp/wave_equation-wide.cpp b/benchmarks/gbench/mp/wave_equation-wide.cpp new file mode 100644 index 0000000000..744a7503c9 --- /dev/null +++ b/benchmarks/gbench/mp/wave_equation-wide.cpp @@ -0,0 +1,801 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cxxopts.hpp" +#include "dr/mp.hpp" +#include "mpi.h" +#include "wave_utils.hpp" +#include +#include + +#ifdef STANDALONE_BENCHMARK + +MPI_Comm comm; +int comm_rank; +int comm_size; + +#else + +#include "../common/dr_bench.hpp" + +#endif + +namespace WaveEquation { + +using T = double; +using Array = dr::mp::distributed_mdarray; + +// gravitational acceleration +constexpr double g = 9.81; +// water depth +constexpr double h = 1.0; + +// Get number of read/write bytes and flops for a single time step +// These numbers correspond to the fused kernel version +void calculate_complexity(std::size_t nx, std::size_t ny, std::size_t &nread, + std::size_t &nwrite, std::size_t &nflop) { + // stage1: 2+2+3 = 7 + // stage2: 3+3+4 = 10 + // stage3: 3+3+4 = 10 + nread = (27 * nx * ny) * sizeof(T); + // stage1: 3 + // stage2: 3 + // stage3: 3 + nwrite = (9 * nx * ny) * sizeof(T); + // stage1: 3+3+6 = 12 + // stage2: 6+6+9 = 21 + // stage3: 6+6+9 = 21 + nflop = 54 * nx * ny; +} + +double exact_elev(double x, double y, double t, double lx, double ly) { + /** + * Exact solution for elevation field. + * + * Returns time-dependent elevation of a 2D standing wave in a + * rectangular domain. + */ + double amp = 0.5; + double c = std::sqrt(g * h); + std::size_t n = 1; + double sol_x = std::cos(2 * n * M_PI * x / lx); + std::size_t m = 1; + double sol_y = std::cos(2 * m * M_PI * y / ly); + double omega = c * M_PI * std::hypot(n / lx, m / ly); + double sol_t = std::cos(2 * omega * t); + return amp * sol_x * sol_y * sol_t; +} + +double initial_elev(double x, double y, double lx, double ly) { + return exact_elev(x, y, 0.0, lx, ly); +} + +void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, + double g, double h, double dx_inv, double dy_inv, double dt, unsigned long redundancy) { + /** + * Evaluate right hand side of the equations + */ + auto rhs_dedx = [dt, g, dx_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; + }; + { + std::array start{1 + redundancy, 0}; + std::array end{e.extent(0) - 1 - redundancy, e.extent(1)}; + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto dudt_view = dr::mp::views::submdspan(dudt.view(), start, end); + dr::mp::stencil_for_each(rhs_dedx, e_view, dudt_view); + } + + auto rhs_dedy = [dt, g, dy_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; + }; + { + std::array start{0 + redundancy, 1}; + std::array end{e.extent(0) - redundancy, e.extent(1)}; + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto dvdt_view = dr::mp::views::submdspan(dvdt.view(), start, end); + dr::mp::stencil_for_each(rhs_dedy, e_view, dvdt_view); + } + + auto rhs_div = [dt, h, dx_inv, dy_inv](auto args) { + auto [u, v, out] = args; + auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; + auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; + out(0, 0) = -dt * h * (dudx + dvdy); + }; + { + std::array start{1 + redundancy, 0}; + std::array end{u.extent(0) - redundancy, u.extent(1)}; + auto u_view = dr::mp::views::submdspan(u.view(), start, end); + auto v_view = dr::mp::views::submdspan(v.view(), start, end); + auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); + dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); + } +}; + +void stage1(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, + double g, double h, double dx_inv, double dy_inv, double dt) { + /** + * Evaluate stage 1 of the RK time stepper + * + * u1 = u + dt*rhs(u) + * + */ + // u: elevation x gradient + dr::mp::halo(e).exchange_finalize(); + auto rhs_u1 = [dt, g, dx_inv](auto tuple) { + auto [e, u, out] = tuple; + auto dedx = (e(1, 0) - e(0, 0)) * dx_inv; + out(0, 0) = u(0, 0) - dt * g * dedx; + }; + { + std::array start{1, 0}; + std::array end{e.extent(0) - 1, e.extent(1)}; + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto u_view = dr::mp::views::submdspan(u.view(), start, end); + auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); + dr::mp::stencil_for_each(rhs_u1, e_view, u_view, u1_view); + } + dr::mp::halo(u1).exchange_begin(); + + // v: elevation y gradient + auto rhs_v1 = [dt, g, dy_inv](auto tuple) { + auto [e, v, out] = tuple; + auto dedy = (e(0, 0) - e(0, -1)) * dy_inv; + out(0, 0) = v(0, 0) - dt * g * dedy; + }; + { + std::array start{0, 1}; + std::array end{e.extent(0), e.extent(1)}; + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto v_view = dr::mp::views::submdspan(v.view(), start, end); + auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); + dr::mp::stencil_for_each(rhs_v1, e_view, v_view, v1_view); + } + dr::mp::halo(v1).exchange_begin(); + + // e: divergence of (u, v) + dr::mp::halo(u).exchange_finalize(); + dr::mp::halo(v).exchange_finalize(); + auto rhs_e1 = [dt, h, dx_inv, dy_inv](auto tuple) { + auto [e, u, v, out] = tuple; + auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; + auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; + out(0, 0) = e(0, 0) - dt * h * (dudx + dvdy); + }; + { + std::array start{1, 0}; + std::array end{u.extent(0), u.extent(1)}; + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto u_view = dr::mp::views::submdspan(u.view(), start, end); + auto v_view = dr::mp::views::submdspan(v.view(), start, end); + auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); + dr::mp::stencil_for_each(rhs_e1, e_view, u_view, v_view, e1_view); + } + dr::mp::halo(e1).exchange_begin(); +}; + +void stage2(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, + Array &u2, Array &v2, Array &e2, double g, double h, double dx_inv, + double dy_inv, double dt) { + /** + * Evaluate stage 2 of the RK time stepper + * + * u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) + * + */ + // u: elevation x gradient + dr::mp::halo(e1).exchange_finalize(); + auto rhs_u2 = [dt, g, dx_inv](auto tuple) { + auto [e1, u1, u, out] = tuple; + auto dedx = (e1(1, 0) - e1(0, 0)) * dx_inv; + out(0, 0) = 0.75 * u(0, 0) + 0.25 * (u1(0, 0) - dt * g * dedx); + }; + { + std::array start{1, 0}; + std::array end{e.extent(0) - 1, e.extent(1)}; + auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); + auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); + auto u_view = dr::mp::views::submdspan(u.view(), start, end); + auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); + dr::mp::stencil_for_each(rhs_u2, e1_view, u1_view, u_view, u2_view); + } + dr::mp::halo(u2).exchange_begin(); + + // v: elevation y gradient + auto rhs_v2 = [dt, g, dy_inv](auto tuple) { + auto [e1, v1, v, out] = tuple; + auto dedy = (e1(0, 0) - e1(0, -1)) * dy_inv; + out(0, 0) = 0.75 * v(0, 0) + 0.25 * (v1(0, 0) - dt * g * dedy); + }; + { + std::array start{0, 1}; + std::array end{e.extent(0), e.extent(1)}; + auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); + auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); + auto v_view = dr::mp::views::submdspan(v.view(), start, end); + auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); + dr::mp::stencil_for_each(rhs_v2, e1_view, v1_view, v_view, v2_view); + } + dr::mp::halo(v2).exchange_begin(); + + // e: divergence of (u, v) + dr::mp::halo(u1).exchange_finalize(); + dr::mp::halo(v1).exchange_finalize(); + auto rhs_e2 = [dt, h, dx_inv, dy_inv](auto tuple) { + auto [e1, u1, v1, e, out] = tuple; + auto dudx = (u1(0, 0) - u1(-1, 0)) * dx_inv; + auto dvdy = (v1(0, 1) - v1(0, 0)) * dy_inv; + out(0, 0) = 0.75 * e(0, 0) + 0.25 * (e1(0, 0) - dt * h * (dudx + dvdy)); + }; + { + std::array start{1, 0}; + std::array end{u.extent(0), u.extent(1)}; + auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); + auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); + auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); + dr::mp::stencil_for_each(rhs_e2, e1_view, u1_view, v1_view, e_view, + e2_view); + } + dr::mp::halo(e2).exchange_begin(); +}; + +void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, + double g, double h, double dx_inv, double dy_inv, double dt) { + /** + * Evaluate stage 3 of the RK time stepper + * + * u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + * + */ + // u: elevation x gradient + dr::mp::halo(e2).exchange_finalize(); + auto rhs_u3 = [dt, g, dx_inv](auto tuple) { + auto [e2, u2, out] = tuple; + auto dedx = (e2(1, 0) - e2(0, 0)) * dx_inv; + out(0, 0) *= 1.0 / 3; + out(0, 0) += 2.0 / 3 * (u2(0, 0) - dt * g * dedx); + }; + { + std::array start{1, 0}; + std::array end{e.extent(0) - 1, e.extent(1)}; + auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); + auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); + auto u_view = dr::mp::views::submdspan(u.view(), start, end); + dr::mp::stencil_for_each(rhs_u3, e2_view, u2_view, u_view); + } + dr::mp::halo(u).exchange_begin(); + + // v: elevation y gradient + auto rhs_v3 = [dt, g, dy_inv](auto tuple) { + auto [e2, v2, out] = tuple; + auto dedy = (e2(0, 0) - e2(0, -1)) * dy_inv; + out(0, 0) *= 1.0 / 3; + out(0, 0) += 2.0 / 3 * (v2(0, 0) - dt * g * dedy); + }; + { + std::array start{0, 1}; + std::array end{e.extent(0), e.extent(1)}; + auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); + auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); + auto v_view = dr::mp::views::submdspan(v.view(), start, end); + dr::mp::stencil_for_each(rhs_v3, e2_view, v2_view, v_view); + } + dr::mp::halo(v).exchange_begin(); + + // e: divergence of (u, v) + dr::mp::halo(u2).exchange_finalize(); + dr::mp::halo(v2).exchange_finalize(); + auto rhs_e3 = [dt, h, dx_inv, dy_inv](auto tuple) { + auto [e2, u2, v2, out] = tuple; + auto dudx = (u2(0, 0) - u2(-1, 0)) * dx_inv; + auto dvdy = (v2(0, 1) - v2(0, 0)) * dy_inv; + out(0, 0) *= 1.0 / 3; + out(0, 0) += 2.0 / 3 * (e2(0, 0) - dt * h * (dudx + dvdy)); + }; + { + std::array start{1, 0}; + std::array end{u.extent(0), u.extent(1)}; + auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); + auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); + auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); + auto e_view = dr::mp::views::submdspan(e.view(), start, end); + dr::mp::stencil_for_each(rhs_e3, e2_view, u2_view, v2_view, e_view); + } + dr::mp::halo(e).exchange_begin(); +}; + +//#define DEBUG + +#ifdef DEBUG +void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { + std::cout << "Array " << str << ":\n"; + std::cout << arr << "\n"; +} +#endif + +int run( + int n, bool benchmark_mode, bool fused_kernels, + std::function iter_callback = []() {}) { + // construct grid + // number of cells in x, y direction + std::size_t nx = n; + std::size_t ny = n; + const double xmin = -1, xmax = 1; + const double ymin = -1, ymax = 1; + ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); + + std::size_t halo_radius = 1 * 2; // 1 - halo size, 2 - redundancy + auto dist = dr::mp::distribution().halo(halo_radius); + + // statistics + std::size_t nread, nwrite, nflop; + calculate_complexity(nx, ny, nread, nwrite, nflop); + + if (comm_rank == 0) { + std::cout << "Using backend: dr" << std::endl; + if (fused_kernels) { + std::cout << "Using fused kernels" << std::endl; + } + std::cout << "Grid size: " << nx << " x " << ny << std::endl; + std::cout << "Elevation DOFs: " << nx * ny << std::endl; + std::cout << "Velocity DOFs: " << (nx + 1) * ny + nx * (ny + 1) + << std::endl; + std::cout << "Total DOFs: " << nx * ny + (nx + 1) * ny + nx * (ny + 1); + std::cout << std::endl; + } + + // compute time step + double t_end = 1.0; + double t_export = 0.02; + + double c = std::sqrt(g * h); + double alpha = 0.5; + double dt = alpha * std::min(grid.dx, grid.dy) / c; + dt = t_export / static_cast(ceil(t_export / dt)); + std::size_t nt = static_cast(ceil(t_end / dt)); + if (benchmark_mode) { + nt = 100; + dt = 1e-5; + t_export = 25 * dt; + t_end = nt * dt; + } + if (comm_rank == 0) { + std::cout << "Time step: " << dt << " s" << std::endl; + std::cout << "Total run time: " << std::fixed << std::setprecision(1); + std::cout << t_end << " s, "; + std::cout << nt << " time steps" << std::endl; + } + + // state variables + // water elevation at T points + Array e({nx + 1, ny}, dist); + dr::mp::fill(e, 0.0); + // x velocity at U points + Array u({nx + 1, ny}, dist); + dr::mp::fill(u, 0.0); + // y velocity at V points + Array v({nx + 1, ny + 1}, dist); + dr::mp::fill(v, 0.0); + + // state for RK stages + Array e1({nx + 1, ny}, dist); + Array u1({nx + 1, ny}, dist); + Array v1({nx + 1, ny + 1}, dist); + Array e2({nx + 1, ny}, dist); + Array u2({nx + 1, ny}, dist); + Array v2({nx + 1, ny + 1}, dist); + + // time tendencies + // NOTE not needed if rhs kernels are fused with RK stage assignment + Array dedt({nx + 1, ny}, dist); + Array dudt({nx + 1, ny}, dist); + Array dvdt({nx + 1, ny + 1}, dist); + + // TODO: figure out smaller views for each of arrays from above + // First phase runs on normal arrays + // Second phase runs on smaller arrays (smaller only by 1 (redundancy)), but only on smaller in first dimension + std::array smaller_view_start_e{1, 0}; + std::array smaller_view_end_e{e.extent(0) - 1, e.extent(1)}; + std::array smaller_view_start_u{1, 0}; + std::array smaller_view_end_u{u.extent(0) - 1, u.extent(1)}; + std::array smaller_view_start_v{1, 0}; + std::array smaller_view_end_v{v.extent(0) - 1, v.extent(1)}; + auto e_smaller_view = dr::mp::views::submdspan(e.view(), smaller_view_start_e, smaller_view_end_e); + auto u_smaller_view = dr::mp::views::submdspan(u.view(), smaller_view_start_u, smaller_view_end_u); + auto v_smaller_view = dr::mp::views::submdspan(v.view(), smaller_view_start_v, smaller_view_end_v); + auto e1_smaller_view = dr::mp::views::submdspan(e1.view(), smaller_view_start_e, smaller_view_end_e); + auto u1_smaller_view = dr::mp::views::submdspan(u1.view(), smaller_view_start_u, smaller_view_end_u); + auto v1_smaller_view = dr::mp::views::submdspan(v1.view(), smaller_view_start_v, smaller_view_end_v); + auto e2_smaller_view = dr::mp::views::submdspan(e2.view(), smaller_view_start_e, smaller_view_end_e); + auto u2_smaller_view = dr::mp::views::submdspan(u2.view(), smaller_view_start_u, smaller_view_end_u); + auto v2_smaller_view = dr::mp::views::submdspan(v2.view(), smaller_view_start_v, smaller_view_end_v); + auto dedt_smaller_view = dr::mp::views::submdspan(dedt.view(), smaller_view_start_e, smaller_view_end_e); + auto dudt_smaller_view = dr::mp::views::submdspan(dudt.view(), smaller_view_start_u, smaller_view_end_u); + auto dvdt_smaller_view = dr::mp::views::submdspan(dvdt.view(), smaller_view_start_v, smaller_view_end_v); + + dr::mp::fill(dedt, 0); + dr::mp::fill(dudt, 0); + dr::mp::fill(dvdt, 0); + dr::mp::halo(dedt).exchange_begin(); + dr::mp::halo(dudt).exchange_begin(); + dr::mp::halo(dvdt).exchange_begin(); + + auto init_op = [xmin, ymin, grid](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + (global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + global_j * grid.dy; + o = initial_elev(x, y, grid.lx, grid.ly); + } + }; + dr::mp::for_each(init_op, e); + + dr::mp::halo(e).exchange_begin(); + dr::mp::halo(u).exchange_begin(); + dr::mp::halo(v).exchange_begin(); + +// if (!fused_kernels) { +// dr::mp::halo(e1).exchange_begin(); +// dr::mp::halo(u1).exchange_begin(); +// dr::mp::halo(v1).exchange_begin(); +// dr::mp::halo(e2).exchange_begin(); +// dr::mp::halo(u2).exchange_begin(); +// dr::mp::halo(v2).exchange_begin(); +// } + + auto add = [](auto ops) { return ops.first + ops.second; }; + auto max = [](double x, double y) { return std::max(x, y); }; + auto rk_update2 = [](auto ops) { + return 0.75 * std::get<0>(ops) + + 0.25 * (std::get<1>(ops) + std::get<2>(ops)); + }; + auto rk_update3 = [](auto ops) { + return 1.0 / 3.0 * std::get<0>(ops) + + 2.0 / 3.0 * (std::get<1>(ops) + std::get<2>(ops)); + }; + + std::size_t i_export = 0; + double next_t_export = 0.0; + double t = 0.0; + double initial_v = 0.0; + auto tic = std::chrono::steady_clock::now(); +#ifdef DEBUG + nt = 5; +#endif + for (std::size_t i = 0; i < nt + 1; i++) { + t = i * dt; + + if (t >= next_t_export - 1e-8) { + + double elev_max = dr::mp::reduce(e, static_cast(0), max); + double u_max = dr::mp::reduce(u, static_cast(0), max); + + double total_v = (dr::mp::reduce(e, static_cast(0), std::plus{}) + h) * + grid.dx * grid.dy; + if (i == 0) { + initial_v = total_v; + } + double diff_v = total_v - initial_v; + + if (comm_rank == 0) { + printf("%2lu %4lu %.3f ", i_export, i, t); + printf("elev=%7.5f ", elev_max); + printf("u=%7.5f ", u_max); + printf("dV=% 6.3e ", diff_v); + printf("\n"); + } + if (elev_max > 1e3) { + if (comm_rank == 0) { + std::cout << "Invalid elevation value: " << elev_max << std::endl; + } + return 1; + } + i_export += 1; + next_t_export = i_export * t_export; + } + + // step + iter_callback(); + if (fused_kernels) { + stage1(u, v, e, u1, v1, e1, g, h, grid.dx_inv, grid.dy_inv, dt); + stage2(u, v, e, u1, v1, e1, u2, v2, e2, g, h, grid.dx_inv, grid.dy_inv, + dt); + stage3(u, v, e, u2, v2, e2, g, h, grid.dx_inv, grid.dy_inv, dt); + } else { + // First phase without communication + if (i % 2 == 1) { + // RK stage 1: u1 = u + dt*rhs(u) + rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); + dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); + dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); +// dr::mp::transform(dr::mp::views::zip(u_smaller_view, dudt_smaller_view), u1_smaller_view.begin(), add); +// dr::mp::transform(dr::mp::views::zip(v_smaller_view, dvdt_smaller_view), v1_smaller_view.begin(), add); +// dr::mp::transform(dr::mp::views::zip(e_smaller_view, dedt_smaller_view), e1_smaller_view.begin(), add); + + // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) + rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); +// dr::mp::transform(dr::mp::views::zip(u_smaller_view, u1_smaller_view, dudt_smaller_view),u2_smaller_view.begin(), rk_update2); +// dr::mp::transform(dr::mp::views::zip(v_smaller_view, v1_smaller_view, dvdt_smaller_view),v2_smaller_view.begin(), rk_update2); +// dr::mp::transform(dr::mp::views::zip(e_smaller_view, e1_smaller_view, dedt_smaller_view),e2_smaller_view.begin(), rk_update2); + + // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); +// dr::mp::transform(dr::mp::views::zip(u_smaller_view, u2_smaller_view, dudt_smaller_view),u_smaller_view.begin(), rk_update3); +// dr::mp::transform(dr::mp::views::zip(v_smaller_view, v2_smaller_view, dvdt_smaller_view),v_smaller_view.begin(), rk_update3); +// dr::mp::transform(dr::mp::views::zip(e_smaller_view, e2_smaller_view, dedt_smaller_view),e_smaller_view.begin(), rk_update3); + } else { + dr::mp::halo(e).exchange_finalize(); + dr::mp::halo(u).exchange_finalize(); + dr::mp::halo(v).exchange_finalize(); + // Second phase with communication + // RK stage 1: u1 = u + dt*rhs(u) + rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); +// dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); +// dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); +// dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); + dr::mp::transform(dr::mp::views::zip(u_smaller_view, dudt_smaller_view), u1_smaller_view.begin(), add); + dr::mp::transform(dr::mp::views::zip(v_smaller_view, dvdt_smaller_view), v1_smaller_view.begin(), add); + dr::mp::transform(dr::mp::views::zip(e_smaller_view, dedt_smaller_view), e1_smaller_view.begin(), add); + + dr::mp::halo(u1).exchange_begin(); + dr::mp::halo(v1).exchange_begin(); + dr::mp::halo(e1).exchange_begin(); + + dr::mp::halo(u1).exchange_finalize(); + dr::mp::halo(v1).exchange_finalize(); + dr::mp::halo(e1).exchange_finalize(); + + // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) + rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); +// dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); +// dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); +// dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(u_smaller_view, u1_smaller_view, dudt_smaller_view),u2_smaller_view.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(v_smaller_view, v1_smaller_view, dvdt_smaller_view),v2_smaller_view.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(e_smaller_view, e1_smaller_view, dedt_smaller_view),e2_smaller_view.begin(), rk_update2); + + dr::mp::halo(u2).exchange_begin(); + dr::mp::halo(v2).exchange_begin(); + dr::mp::halo(e2).exchange_begin(); + + dr::mp::halo(u2).exchange_finalize(); + dr::mp::halo(v2).exchange_finalize(); + dr::mp::halo(e2).exchange_finalize(); + + // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); +// dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); +// dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); +// dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(u_smaller_view, u2_smaller_view, dudt_smaller_view),u_smaller_view.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(v_smaller_view, v2_smaller_view, dvdt_smaller_view),v_smaller_view.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(e_smaller_view, e2_smaller_view, dedt_smaller_view),e_smaller_view.begin(), rk_update3); + + dr::mp::halo(u).exchange_begin(); + dr::mp::halo(v).exchange_begin(); + dr::mp::halo(e).exchange_begin(); + + } +#ifdef DEBUG + std::cout << "Iter " << i << "\n"; + debug_print_arr(nx + 1, ny, e, "e"); + debug_print_arr(nx + 1, ny, u, "u"); + debug_print_arr(nx + 1, ny + 1, v, "v"); + debug_print_arr(nx + 1, ny, e1, "e1"); + debug_print_arr(nx + 1, ny, u1, "u1"); + debug_print_arr(nx + 1, ny + 1, v1, "v1"); + debug_print_arr(nx + 1, ny, e2, "e2"); + debug_print_arr(nx + 1, ny, u2, "u2"); + debug_print_arr(nx + 1, ny + 1, v2, "v2"); + debug_print_arr(nx + 1, ny, dedt, "dedt"); + debug_print_arr(nx + 1, ny, dudt, "dudt"); + debug_print_arr(nx + 1, ny + 1, dvdt, "dvdt"); +#endif + } + } + dr::mp::halo(u).exchange_finalize(); + dr::mp::halo(v).exchange_finalize(); + dr::mp::halo(e).exchange_finalize(); + auto toc = std::chrono::steady_clock::now(); + std::chrono::duration duration = toc - tic; + if (comm_rank == 0) { + double t_cpu = duration.count(); + double t_step = t_cpu / nt; + double read_bw = double(nread) / t_step / (1024 * 1024 * 1024); + double write_bw = double(nwrite) / t_step / (1024 * 1024 * 1024); + double flop_rate = double(nflop) / t_step / (1000 * 1000 * 1000); + double ai = double(nflop) / double(nread + nwrite); + std::cout << "Duration: " << std::setprecision(3) << t_cpu; + std::cout << " s" << std::endl; + std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000; + std::cout << " ms" << std::endl; + std::cout << "Reads : " << std::setprecision(3) << read_bw; + std::cout << " GB/s" << std::endl; + std::cout << "Writes: " << std::setprecision(3) << write_bw; + std::cout << " GB/s" << std::endl; + std::cout << "FLOP/s: " << std::setprecision(3) << flop_rate; + std::cout << " GFLOP/s" << std::endl; + std::cout << "Arithmetic intensity: " << std::setprecision(5) << ai; + std::cout << " FLOP/Byte" << std::endl; + } + + // Compute error against exact solution + Array e_exact({nx + 1, ny}, dist); + dr::mp::fill(e_exact, 0.0); + Array error({nx + 1, ny}, dist); + + auto exact_op = [xmin, ymin, grid, t](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + (global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + global_j * grid.dy; + o = exact_elev(x, y, t, grid.lx, grid.ly); + } + }; + dr::mp::for_each(exact_op, e_exact); + dr::mp::halo(e_exact).exchange(); + auto error_kernel = [](auto ops) { + auto err = ops.first - ops.second; + return err * err; + }; + dr::mp::transform(dr::mp::views::zip(e, e_exact), error.begin(), + error_kernel); + double err_L2 = dr::mp::reduce(error, static_cast(0), std::plus{}) * + grid.dx * grid.dy / grid.lx / grid.ly; + err_L2 = std::sqrt(err_L2); + if (comm_rank == 0) { + std::cout << "L2 error: " << std::setw(7) << std::scientific; + std::cout << std::setprecision(5) << err_L2 << std::endl; + } + + if (benchmark_mode) { + return 0; + } + if (nx < 128 || ny < 128) { + if (comm_rank == 0) { + std::cout << "Skipping correctness test due to small problem size." + << std::endl; + } + } else if (nx == 128 && ny == 128) { + double expected_L2 = 0.007224068445111; + double rel_tolerance = 1e-6; + double rel_err = err_L2 / expected_L2 - 1.0; + if (!(fabs(rel_err) < rel_tolerance)) { + if (comm_rank == 0) { + std::cout << "ERROR: L2 error deviates from reference value: " + << expected_L2 << ", relative error: " << rel_err + << std::endl; + } + return 1; + } + } else { + double tolerance = 1e-2; + if (!(err_L2 < tolerance)) { + if (comm_rank == 0) { + std::cout << "ERROR: L2 error exceeds tolerance: " << err_L2 << " > " + << tolerance << std::endl; + } + return 1; + } + } + if (comm_rank == 0) { + std::cout << "SUCCESS" << std::endl; + } + + return 0; +} + +} // namespace WaveEquation + +#ifdef STANDALONE_BENCHMARK + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + cxxopts::Options options_spec(argv[0], "wave equation"); + // clang-format off + options_spec.add_options() +#ifndef DEBUG + ("n", "Grid size", cxxopts::value()->default_value("128")) +#else + ("n", "Grid size", cxxopts::value()->default_value("16")) +#endif + ("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value()->default_value("false")) + ("sycl", "Execute on SYCL device") + ("l,log", "enable logging") + ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("f,fused-kernel", "Use fused kernels.", cxxopts::value()->default_value("false")) + ("device-memory", "Use device memory") + ("h,help", "Print help"); + // clang-format on + + cxxopts::ParseResult options; + try { + options = options_spec.parse(argc, argv); + } catch (const cxxopts::OptionParseException &e) { + std::cout << options_spec.help() << "\n"; + exit(1); + } + + std::unique_ptr logfile; + if (options.count("log")) { + logfile.reset(new std::ofstream(options["logprefix"].as() + + fmt::format(".{}.log", comm_rank))); + dr::drlog.set_file(*logfile); + } + + if (options.count("sycl")) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + std::cout << "Run on: " + << q.get_device().get_info() << "\n"; + dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); +#else + std::cout << "Sycl support requires icpx\n"; + exit(1); +#endif + } else { + if (comm_rank == 0) { + std::cout << "Run on: CPU\n"; + } + dr::mp::init(); + } + + std::size_t n = options["n"].as(); + bool benchmark_mode = options["t"].as(); + bool fused_kernels = options["f"].as(); + + auto error = WaveEquation::run(n, benchmark_mode, fused_kernels); + dr::mp::finalize(); + MPI_Finalize(); + return error; +} + +#else + +static void WaveEquation_DR(benchmark::State &state) { + + int n = ::sqrtl(default_vector_size); + + // ugly hack to make it working in reasonable time in benchmarking framework + // drbench.py should specify right size or there should be another size option + // to use here instead of default_vector_size + n /= 4; + + std::size_t nread, nwrite, nflop; + WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop); + Stats stats(state, nread, nwrite, nflop); + + auto iter_callback = [&stats]() { stats.rep(); }; + for (auto _ : state) { + WaveEquation::run(n, true, true, iter_callback); + } +} + +DR_BENCHMARK(WaveEquation_DR); + +#endif diff --git a/benchmarks/gbench/mp/wave_equation.cpp b/benchmarks/gbench/mp/wave_equation.cpp index ee0a5b9799..4b8dc33c92 100644 --- a/benchmarks/gbench/mp/wave_equation.cpp +++ b/benchmarks/gbench/mp/wave_equation.cpp @@ -313,6 +313,15 @@ void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, dr::mp::halo(e).exchange_begin(); }; +//#define DEBUG + +#ifdef DEBUG + void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { + std::cout << "Array " << str << ":\n"; + std::cout << arr << "\n"; +} +#endif + int run( int n, bool benchmark_mode, bool fused_kernels, std::function iter_callback = []() {}) { @@ -431,6 +440,9 @@ int run( double t = 0.0; double initial_v = 0.0; auto tic = std::chrono::steady_clock::now(); +#ifdef DEBUG + nt = 5; +#endif for (std::size_t i = 0; i < nt + 1; i++) { t = i * dt; @@ -500,6 +512,21 @@ int run( dr::mp::halo(v).exchange_begin(); dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); dr::mp::halo(e).exchange_begin(); +#ifdef DEBUG + std::cout << "Iter " << i << "\n"; + debug_print_arr(nx + 1, ny, e, "e"); + debug_print_arr(nx + 1, ny, u, "u"); + debug_print_arr(nx + 1, ny + 1, v, "v"); + debug_print_arr(nx + 1, ny, e1, "e1"); + debug_print_arr(nx + 1, ny, u1, "u1"); + debug_print_arr(nx + 1, ny + 1, v1, "v1"); + debug_print_arr(nx + 1, ny, e2, "e2"); + debug_print_arr(nx + 1, ny, u2, "u2"); + debug_print_arr(nx + 1, ny + 1, v2, "v2"); + debug_print_arr(nx + 1, ny, dedt, "dedt"); + debug_print_arr(nx + 1, ny, dudt, "dudt"); + debug_print_arr(nx + 1, ny + 1, dvdt, "dvdt"); +#endif } } dr::mp::halo(u).exchange_finalize(); @@ -611,7 +638,11 @@ int main(int argc, char *argv[]) { cxxopts::Options options_spec(argv[0], "wave equation"); // clang-format off options_spec.add_options() +#ifndef DEBUG ("n", "Grid size", cxxopts::value()->default_value("128")) +#else + ("n", "Grid size", cxxopts::value()->default_value("16")) +#endif ("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value()->default_value("false")) ("sycl", "Execute on SYCL device") ("l,log", "enable logging") diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp index ce4f6305aa..aa1ec9cae2 100644 --- a/include/dr/mp/halo/halo.hpp +++ b/include/dr/mp/halo/halo.hpp @@ -17,6 +17,7 @@ namespace dr::mp { }; struct halo_bounds { + // How many values before and after the data segment are in halo std::size_t prev = 0, next = 0; bool periodic = false; }; @@ -157,6 +158,7 @@ namespace dr::mp { g.pack(); g.receive = false; DRLOG("sending: {}", g.request_index); +// std::cout << "send(" << g.data_pointer() << ", " << g.data_size() << ", " << g.rank() << ", , " << &requests_[g.request_index] << ")\n"; comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), &requests_[g.request_index]); } @@ -166,6 +168,7 @@ namespace dr::mp { for (auto &g: receives) { g.receive = true; DRLOG("receiving: {}", g.request_index); +// std::cout << "recv(" << g.data_pointer() << ", " << g.data_size() << ", " << g.rank() << ", , " << &requests_[g.request_index] << ")\n"; comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), &requests_[g.request_index]); } diff --git a/test/gtest/mp/halo-3.cpp b/test/gtest/mp/halo-3.cpp index dfc372ebd5..95ca25cff7 100644 --- a/test/gtest/mp/halo-3.cpp +++ b/test/gtest/mp/halo-3.cpp @@ -222,3 +222,72 @@ TYPED_TEST(Halo3, dv_halos_prev_0) { EXPECT_EQ(*(dv.begin() + 5).local(), 6); } } + +TYPED_TEST(Halo3, halo_wide) { + TypeParam dv(9, dr::mp::distribution().halo(3)); + fill(dv, 7); + dv.halo().exchange(); + + fill(dv, 13); + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 7); + EXPECT_EQ(*(dv.begin() + 1).local(), 7); + EXPECT_EQ(*(dv.begin() + 2).local(), 7); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 7); + EXPECT_EQ(*(dv.begin() + 7).local(), 7); + EXPECT_EQ(*(dv.begin() + 8).local(), 7); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + } + + dv.halo().exchange(); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + } +} \ No newline at end of file From 61dd55c519ee5931f5273933bf6cd1ba437d2899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Mon, 23 Sep 2024 14:22:12 +0200 Subject: [PATCH 03/19] Wide stencil 1d --- include/dr/mp/algorithms/for_each.hpp | 122 ++++++++++++ .../dr/mp/containers/distributed_vector.hpp | 19 +- include/dr/mp/containers/distribution.hpp | 15 +- include/dr/mp/containers/segment.hpp | 32 ++- include/dr/mp/halo/halo.hpp | 8 + test/gtest/mp/CMakeLists.txt | 12 +- test/gtest/mp/wide-halo-1d-3.cpp | 186 ++++++++++++++++++ 7 files changed, 386 insertions(+), 8 deletions(-) create mode 100644 test/gtest/mp/wide-halo-1d-3.cpp diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 8851208198..e57dd03e89 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include namespace dr::mp { @@ -62,4 +64,124 @@ DI for_each_n(DI first, I n, auto op) { return last; } +namespace __detail { + template + using stencil_index_type = dr::__detail::dr_extents; + + void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, stencil_index_type<1> end, const auto& segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if ((distance | std::views::filter(is_zero)).empty()) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { + return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], seg.extents()); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(dr::ranges::local(seg_info.first + index)), + seg_info.second + ); + }); + op(stencils, index); + }; + if (mp::use_sycl()) { + dr::drlog.debug(" using sycl\n"); + +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), sycl::range<1>(distance[0]), + do_point) + .wait(); +#else + assert(false); +#endif + } else { + dr::drlog.debug(" using cpu\n"); + for (std::size_t i = 0; i < distance[0]; i++) { + do_point(i); + } + } + } + + void stencil_for_each_extended_2(auto op, stencil_index_type<1> begin, stencil_index_type<1> end, const auto& segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if ((distance | std::views::filter(is_zero)).empty()) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { + return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], seg.extents()); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(dr::ranges::local(seg_info.first + index)), + seg_info.second + ); + }); + op(stencils, index); + }; + if (mp::use_sycl()) { + dr::drlog.debug(" using sycl\n"); + +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), + do_point) + .wait(); +#else + assert(false); +#endif + } else { + dr::drlog.debug(" using cpu\n"); + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t i = 0; i < distance[1]; i++) { + do_point(i); + } + } + } + } +} + +template +requires (1 <= Rank && Rank <= 3) +void stencil_for_each_extended(auto op, __detail::stencil_index_type begin, __detail::stencil_index_type end, dr::distributed_range auto &&...drs) { + dr::drlog.debug(dr::logger::for_each, "for_each_extended: parallel execution\n"); + auto ranges = std::tie(drs...); + auto &&dr0 = std::get<0>(ranges); + if (rng::empty(dr0)) { + return; + } + + auto all_segments = rng::views::zip(dr::ranges::segments(drs)...); + for (const auto &segs : all_segments) { + if constexpr (Rank == 1) { + __detail::stencil_for_each_extended_1(op, begin, end, segs); + } + else if constexpr (Rank == 2) { + __detail::stencil_for_each_extended_2(op, begin, end, segs); + } + else if constexpr (Rank == 3) {} + } + barrier(); +} + } // namespace dr::mp diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 2611963064..02f0244992 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,6 +276,9 @@ template class distributed_vector { void fence() { backend.fence(); } + auto &dist() const { + return distribution_; + } private: void init(auto size, auto dist) { size_ = size; @@ -292,6 +295,14 @@ template class distributed_vector { segment_size_ = gran * std::max({(size / gran + comm_size - 1) / comm_size, hb.prev / gran, hb.next / gran}); + __detail::extended_local_data_distribution ext_dist; + if (default_comm().rank() * segment_size_ >= hb.prev) + ext_dist.begin = default_comm().rank() * segment_size_ - hb.prev; + else + ext_dist.begin = 0; + ext_dist.end = std::min((default_comm().rank() + 1) * segment_size_ + hb.next, size_); + ext_dist.segment_size = segment_size_; + data_size_ = segment_size_ + hb.prev + hb.next; if (size_ > 0) { @@ -302,8 +313,12 @@ template class distributed_vector { std::size_t segment_index = 0; for (std::size_t i = 0; i < size; i += segment_size_) { - segments_.emplace_back(this, segment_index++, - std::min(segment_size_, size - i), data_size_); + segments_.emplace_back( + this, + segment_index++, + std::min(segment_size_, size - i), + data_size_, + ext_dist); } fence(); diff --git a/include/dr/mp/containers/distribution.hpp b/include/dr/mp/containers/distribution.hpp index 44f4f43eb4..d933520dd4 100644 --- a/include/dr/mp/containers/distribution.hpp +++ b/include/dr/mp/containers/distribution.hpp @@ -22,7 +22,19 @@ struct distribution { return *this; } - auto halo() const { return halo_bounds_; } + auto halo() const { + halo_bounds halo_bounds_resized = halo_bounds_; + halo_bounds_resized.prev *= redundancy_; + halo_bounds_resized.next *= redundancy_; + return halo_bounds_resized; + } + + distribution &redundancy(std::size_t redundancy) { + redundancy_ = redundancy; + return *this; + } + + auto redundancy() const { return redundancy_; } distribution &periodic(bool periodic) { halo_bounds_.periodic = periodic; @@ -40,6 +52,7 @@ struct distribution { private: halo_bounds halo_bounds_; + std::size_t redundancy_ = 1; std::size_t granularity_ = 1; }; diff --git a/include/dr/mp/containers/segment.hpp b/include/dr/mp/containers/segment.hpp index 56724ac61b..5f6b4f020a 100644 --- a/include/dr/mp/containers/segment.hpp +++ b/include/dr/mp/containers/segment.hpp @@ -6,6 +6,14 @@ namespace dr::mp { +namespace __detail { + struct extended_local_data_distribution { + std::size_t begin; + std::size_t end; + std::size_t segment_size; + }; +} // __detail + template class dv_segment_iterator; template class dv_segment_reference { @@ -215,15 +223,21 @@ template class dv_segment { private: using iterator = dv_segment_iterator; + using stencil_index_type = dr::__detail::dr_extents<1>; public: using difference_type = std::ptrdiff_t; dv_segment() = default; dv_segment(DV *dv, std::size_t segment_index, std::size_t size, - std::size_t reserved) { + std::size_t reserved, const __detail::extended_local_data_distribution& ext_dist) { dv_ = dv; segment_index_ = segment_index; size_ = size; reserved_ = reserved; + ext_dist_ = ext_dist; + + begin_index_ = segment_index * ext_dist.segment_size; + end_index_ = segment_index * ext_dist.segment_size + size_; + assert(dv_ != nullptr); } @@ -236,15 +250,29 @@ template class dv_segment { auto end() const { return begin() + size(); } auto reserved() const { return reserved_; } + [[nodiscard]] stencil_index_type begin_stencil(stencil_index_type stencil) const { + return {std::min(std::max(begin_index_, ext_dist_.begin + stencil[0]), end_index_) - begin_index_}; + } + [[nodiscard]] stencil_index_type end_stencil(stencil_index_type stencil) const { + return {std::max(std::min(end_index_, ext_dist_.end - stencil[0]), begin_index_) - begin_index_}; + } + [[nodiscard]] std::pair stencil(stencil_index_type begin, stencil_index_type end) const { + return {begin_stencil(begin), end_stencil(end)}; + } + auto extents() const { return md::extents(reserved_); } + auto operator[](difference_type n) const { return *(begin() + n); } bool is_local() const { return segment_index_ == default_comm().rank(); } - private: DV *dv_ = nullptr; std::size_t segment_index_; std::size_t size_; std::size_t reserved_; + + std::size_t begin_index_; + std::size_t end_index_; + __detail::extended_local_data_distribution ext_dist_; }; // dv_segment // diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp index aa1ec9cae2..cfe1d49f08 100644 --- a/include/dr/mp/halo/halo.hpp +++ b/include/dr/mp/halo/halo.hpp @@ -182,4 +182,12 @@ namespace dr::mp { std::vector map_; Memory memory_; }; + + template + void halo_exchange(auto&& f, T &dv, Ts &...dvs) { + for (std::size_t step = 0; step < dv.dist().redundancy(); step++) { + f(dv, dvs...); + } + halo(dv).exchange(); + } } diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index a300d47d1e..4868dfb81c 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -42,14 +42,20 @@ add_executable( stencil.cpp segments.cpp slide_view.cpp - wave_kernel.cpp) + wave_kernel.cpp + wide-halo-1d-3.cpp + wide-halo-2d-3.cpp +) add_executable( mp-tests-3 mp-tests.cpp communicator-3.cpp halo-3.cpp - slide_view-3.cpp) + slide_view-3.cpp + wide-halo-1d-3.cpp + wide-halo-2d-3.cpp +) # mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it # builds much faster. Change the source files to match what you need to test. It @@ -61,7 +67,7 @@ add_executable(mp-quick-test ) add_executable(mp-quick-test-3-only mp-tests.cpp - halo-3.cpp + wide-halo-1d-3.cpp ) # cmake-format: on diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp new file mode 100644 index 0000000000..7b0d4890b8 --- /dev/null +++ b/test/gtest/mp/wide-halo-1d-3.cpp @@ -0,0 +1,186 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +template class WideHalo3 : public testing::Test {}; + +using T = int; +using Array = dr::mp::distributed_vector; + +const std::size_t redundancy = 2; +const std::size_t size = 6; + +dr::mp::distribution get_distribution() { + return dr::mp::distribution() + .halo(1) + .redundancy(redundancy); +} + +int& get(Array& v, std::size_t i) { + return *(v.begin() + i).local(); +} + +TEST(WideHalo3, suite_works_for_3_processes_only) { + EXPECT_EQ(dr::mp::default_comm().size(), 3); +} + +TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto print = [&](const auto& v) { + for (auto seg : v.segments()) { + for (auto i = seg.begin_stencil({0ul})[0]; i < seg.end_stencil({0ul})[0]; i++) { + std::cout << *(seg.begin() + i).local() << " "; + } + } + std::cout << "\n"; + }; + + auto transform = [&]{ + stencil_for_each_extended<1>([](auto stencils, auto id){ + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>([](auto stencils, auto id){ + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, {0}, {0}, dv, dv_out); + }; + + transform(); + print(dv); + + // after first step, only actually stored values and their neighbours are guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; + } + + // after second step, only actually stored values are guaranteed to be correct + + transform(); + print(dv); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 7); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 7); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } + + // after exchange all are correct + dv.halo().exchange(); + print(dv); + + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } +} + +TEST(WideHalo3, halo_api_works) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + halo_exchange([](Array& dv, Array& dv_out){ + stencil_for_each_extended<1>([](auto stencils, auto id){ + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>([](auto stencils, auto id){ + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, {0}, {0}, dv, dv_out); + }, dv, dv_out); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + } +} + From fc411746932511bc0d11d1ec094d1ad695ac5774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Wed, 2 Oct 2024 18:15:27 +0200 Subject: [PATCH 04/19] Wide stencil 2d --- benchmarks/gbench/mp/CMakeLists.txt | 8 +- benchmarks/gbench/mp/wave_equation-wide.cpp | 207 ++++++++---------- benchmarks/gbench/mp/wave_equation.cpp | 16 +- include/dr/mp/algorithms/for_each.hpp | 65 +++--- include/dr/mp/algorithms/md_for_each.hpp | 1 + .../dr/mp/containers/distributed_mdarray.hpp | 12 +- .../dr/mp/containers/distributed_vector.hpp | 10 +- include/dr/mp/containers/distribution.hpp | 18 ++ include/dr/mp/containers/segment.hpp | 14 +- include/dr/mp/views/mdspan_view.hpp | 94 +++++--- test/gtest/mp/CMakeLists.txt | 2 +- test/gtest/mp/wide-halo-1d-3.cpp | 8 +- test/gtest/mp/wide-halo-2d-3.cpp | 125 +++++++++++ 13 files changed, 375 insertions(+), 205 deletions(-) create mode 100644 test/gtest/mp/wide-halo-2d-3.cpp diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index 24d50a1e50..6095a65807 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -88,13 +88,13 @@ add_mp_ctest(NAME wave_equation-wide) # DRA-92 if(ENABLE_SYCL) add_mp_ctest( - TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 200 NPROC 8 SYCL) + TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) add_mp_ctest( - TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 200 NPROC 2 SYCL TARGS -f) + TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) add_mp_ctest( - TEST_NAME wave_equation-wide-sycl NAME wave_equation-wide TIMEOUT 200 NPROC 8 SYCL) + TEST_NAME wave_equation-wide-sycl NAME wave_equation-wide TIMEOUT 1000 NPROC 8 SYCL) add_mp_ctest( - TEST_NAME wave_equation-wide_fused-sycl NAME wave_equation-wide TIMEOUT 200 NPROC 2 SYCL TARGS -f) + TEST_NAME wave_equation-wide_fused-sycl NAME wave_equation-wide TIMEOUT 1000 NPROC 2 SYCL TARGS -f) endif() add_executable(shallow_water shallow_water.cpp) diff --git a/benchmarks/gbench/mp/wave_equation-wide.cpp b/benchmarks/gbench/mp/wave_equation-wide.cpp index 744a7503c9..9fcf3fe12d 100644 --- a/benchmarks/gbench/mp/wave_equation-wide.cpp +++ b/benchmarks/gbench/mp/wave_equation-wide.cpp @@ -71,8 +71,10 @@ double initial_elev(double x, double y, double lx, double ly) { return exact_elev(x, y, 0.0, lx, ly); } +#define DEBUG + void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, - double g, double h, double dx_inv, double dy_inv, double dt, unsigned long redundancy) { + double g, double h, double dx_inv, double dy_inv, double dt) { /** * Evaluate right hand side of the equations */ @@ -81,11 +83,15 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; }; { - std::array start{1 + redundancy, 0}; - std::array end{e.extent(0) - 1 - redundancy, e.extent(1)}; - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto dudt_view = dr::mp::views::submdspan(dudt.view(), start, end); - dr::mp::stencil_for_each(rhs_dedx, e_view, dudt_view); +#ifdef DEBUG + std::cout << "stage1\n"; +#endif +// std::array start{1, 0}; +// std::array end{e.extent(0) - 1, e.extent(1)}; +// auto e_view = dr::mp::views::submdspan(e.view(), start, end); +// auto dudt_view = dr::mp::views::submdspan(dudt.view(), start, end); +// dr::mp::stencil_for_each(rhs_dedx, e_view, dudt_view); + stencil_for_each_extended<2>(rhs_dedx, {0, 0}, {1, 0}, e, dudt); } auto rhs_dedy = [dt, g, dy_inv](auto v) { @@ -93,11 +99,15 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; }; { - std::array start{0 + redundancy, 1}; - std::array end{e.extent(0) - redundancy, e.extent(1)}; - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto dvdt_view = dr::mp::views::submdspan(dvdt.view(), start, end); - dr::mp::stencil_for_each(rhs_dedy, e_view, dvdt_view); +#ifdef DEBUG + std::cout << "stage2\n"; +#endif +// std::array start{0, 1}; +// std::array end{e.extent(0), e.extent(1)}; +// auto e_view = dr::mp::views::submdspan(e.view(), start, end); +// auto dvdt_view = dr::mp::views::submdspan(dvdt.view(), start, end); +// dr::mp::stencil_for_each(rhs_dedy, e_view, dvdt_view); + stencil_for_each_extended<2>(rhs_dedy, {0, 1}, {0, 0}, e, dvdt); } auto rhs_div = [dt, h, dx_inv, dy_inv](auto args) { @@ -107,13 +117,20 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * h * (dudx + dvdy); }; { - std::array start{1 + redundancy, 0}; - std::array end{u.extent(0) - redundancy, u.extent(1)}; - auto u_view = dr::mp::views::submdspan(u.view(), start, end); - auto v_view = dr::mp::views::submdspan(v.view(), start, end); - auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); - dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); +#ifdef DEBUG + std::cout << "stage3\n"; +#endif +// std::array start{1, 0}; +// std::array end{u.extent(0), u.extent(1)}; +// auto u_view = dr::mp::views::submdspan(u.view(), start, end); +// auto v_view = dr::mp::views::submdspan(v.view(), start, end); +// auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); +// dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); + stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt); } +#ifdef DEBUG + std::cout << "after\n"; +#endif }; void stage1(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, @@ -310,8 +327,6 @@ void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, dr::mp::halo(e).exchange_begin(); }; -//#define DEBUG - #ifdef DEBUG void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { std::cout << "Array " << str << ":\n"; @@ -330,8 +345,9 @@ int run( const double ymin = -1, ymax = 1; ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); - std::size_t halo_radius = 1 * 2; // 1 - halo size, 2 - redundancy - auto dist = dr::mp::distribution().halo(halo_radius); + auto dist = dr::mp::distribution() + .halo(1) + .redundancy(2); // statistics std::size_t nread, nwrite, nflop; @@ -397,34 +413,12 @@ int run( Array dudt({nx + 1, ny}, dist); Array dvdt({nx + 1, ny + 1}, dist); - // TODO: figure out smaller views for each of arrays from above - // First phase runs on normal arrays - // Second phase runs on smaller arrays (smaller only by 1 (redundancy)), but only on smaller in first dimension - std::array smaller_view_start_e{1, 0}; - std::array smaller_view_end_e{e.extent(0) - 1, e.extent(1)}; - std::array smaller_view_start_u{1, 0}; - std::array smaller_view_end_u{u.extent(0) - 1, u.extent(1)}; - std::array smaller_view_start_v{1, 0}; - std::array smaller_view_end_v{v.extent(0) - 1, v.extent(1)}; - auto e_smaller_view = dr::mp::views::submdspan(e.view(), smaller_view_start_e, smaller_view_end_e); - auto u_smaller_view = dr::mp::views::submdspan(u.view(), smaller_view_start_u, smaller_view_end_u); - auto v_smaller_view = dr::mp::views::submdspan(v.view(), smaller_view_start_v, smaller_view_end_v); - auto e1_smaller_view = dr::mp::views::submdspan(e1.view(), smaller_view_start_e, smaller_view_end_e); - auto u1_smaller_view = dr::mp::views::submdspan(u1.view(), smaller_view_start_u, smaller_view_end_u); - auto v1_smaller_view = dr::mp::views::submdspan(v1.view(), smaller_view_start_v, smaller_view_end_v); - auto e2_smaller_view = dr::mp::views::submdspan(e2.view(), smaller_view_start_e, smaller_view_end_e); - auto u2_smaller_view = dr::mp::views::submdspan(u2.view(), smaller_view_start_u, smaller_view_end_u); - auto v2_smaller_view = dr::mp::views::submdspan(v2.view(), smaller_view_start_v, smaller_view_end_v); - auto dedt_smaller_view = dr::mp::views::submdspan(dedt.view(), smaller_view_start_e, smaller_view_end_e); - auto dudt_smaller_view = dr::mp::views::submdspan(dudt.view(), smaller_view_start_u, smaller_view_end_u); - auto dvdt_smaller_view = dr::mp::views::submdspan(dvdt.view(), smaller_view_start_v, smaller_view_end_v); - dr::mp::fill(dedt, 0); dr::mp::fill(dudt, 0); dr::mp::fill(dvdt, 0); - dr::mp::halo(dedt).exchange_begin(); - dr::mp::halo(dudt).exchange_begin(); - dr::mp::halo(dvdt).exchange_begin(); + dr::mp::halo(dedt).exchange(); + dr::mp::halo(dudt).exchange(); + dr::mp::halo(dvdt).exchange(); auto init_op = [xmin, ymin, grid](auto index, auto v) { auto &[o] = v; @@ -439,19 +433,6 @@ int run( }; dr::mp::for_each(init_op, e); - dr::mp::halo(e).exchange_begin(); - dr::mp::halo(u).exchange_begin(); - dr::mp::halo(v).exchange_begin(); - -// if (!fused_kernels) { -// dr::mp::halo(e1).exchange_begin(); -// dr::mp::halo(u1).exchange_begin(); -// dr::mp::halo(v1).exchange_begin(); -// dr::mp::halo(e2).exchange_begin(); -// dr::mp::halo(u2).exchange_begin(); -// dr::mp::halo(v2).exchange_begin(); -// } - auto add = [](auto ops) { return ops.first + ops.second; }; auto max = [](double x, double y) { return std::max(x, y); }; auto rk_update2 = [](auto ops) { @@ -511,86 +492,63 @@ int run( dt); stage3(u, v, e, u2, v2, e2, g, h, grid.dx_inv, grid.dy_inv, dt); } else { - // First phase without communication if (i % 2 == 1) { +#ifdef DEBUG + std::cout << "no communication\n"; +#endif + // First phase without communication + // RK stage 1: u1 = u + dt*rhs(u) - rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); -// dr::mp::transform(dr::mp::views::zip(u_smaller_view, dudt_smaller_view), u1_smaller_view.begin(), add); -// dr::mp::transform(dr::mp::views::zip(v_smaller_view, dvdt_smaller_view), v1_smaller_view.begin(), add); -// dr::mp::transform(dr::mp::views::zip(e_smaller_view, dedt_smaller_view), e1_smaller_view.begin(), add); // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) - rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); -// dr::mp::transform(dr::mp::views::zip(u_smaller_view, u1_smaller_view, dudt_smaller_view),u2_smaller_view.begin(), rk_update2); -// dr::mp::transform(dr::mp::views::zip(v_smaller_view, v1_smaller_view, dvdt_smaller_view),v2_smaller_view.begin(), rk_update2); -// dr::mp::transform(dr::mp::views::zip(e_smaller_view, e1_smaller_view, dedt_smaller_view),e2_smaller_view.begin(), rk_update2); // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) - rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); + rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); -// dr::mp::transform(dr::mp::views::zip(u_smaller_view, u2_smaller_view, dudt_smaller_view),u_smaller_view.begin(), rk_update3); -// dr::mp::transform(dr::mp::views::zip(v_smaller_view, v2_smaller_view, dvdt_smaller_view),v_smaller_view.begin(), rk_update3); -// dr::mp::transform(dr::mp::views::zip(e_smaller_view, e2_smaller_view, dedt_smaller_view),e_smaller_view.begin(), rk_update3); } else { - dr::mp::halo(e).exchange_finalize(); - dr::mp::halo(u).exchange_finalize(); - dr::mp::halo(v).exchange_finalize(); +#ifdef DEBUG + std::cout << "exchanges\n"; +#endif // Second phase with communication + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + // RK stage 1: u1 = u + dt*rhs(u) - rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); -// dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); -// dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); -// dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); - dr::mp::transform(dr::mp::views::zip(u_smaller_view, dudt_smaller_view), u1_smaller_view.begin(), add); - dr::mp::transform(dr::mp::views::zip(v_smaller_view, dvdt_smaller_view), v1_smaller_view.begin(), add); - dr::mp::transform(dr::mp::views::zip(e_smaller_view, dedt_smaller_view), e1_smaller_view.begin(), add); - - dr::mp::halo(u1).exchange_begin(); - dr::mp::halo(v1).exchange_begin(); - dr::mp::halo(e1).exchange_begin(); - - dr::mp::halo(u1).exchange_finalize(); - dr::mp::halo(v1).exchange_finalize(); - dr::mp::halo(e1).exchange_finalize(); + rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); + dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); + dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); + + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) - rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); -// dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); -// dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); -// dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(u_smaller_view, u1_smaller_view, dudt_smaller_view),u2_smaller_view.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(v_smaller_view, v1_smaller_view, dvdt_smaller_view),v2_smaller_view.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(e_smaller_view, e1_smaller_view, dedt_smaller_view),e2_smaller_view.begin(), rk_update2); - - dr::mp::halo(u2).exchange_begin(); - dr::mp::halo(v2).exchange_begin(); - dr::mp::halo(e2).exchange_begin(); - - dr::mp::halo(u2).exchange_finalize(); - dr::mp::halo(v2).exchange_finalize(); - dr::mp::halo(e2).exchange_finalize(); + rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u1, dudt),u2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(v, v1, dvdt),v2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(e, e1, dedt),e2.begin(), rk_update2); - // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) - rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt, 0); -// dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); -// dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); -// dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(u_smaller_view, u2_smaller_view, dudt_smaller_view),u_smaller_view.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(v_smaller_view, v2_smaller_view, dvdt_smaller_view),v_smaller_view.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(e_smaller_view, e2_smaller_view, dedt_smaller_view),e_smaller_view.begin(), rk_update3); - - dr::mp::halo(u).exchange_begin(); - dr::mp::halo(v).exchange_begin(); - dr::mp::halo(e).exchange_begin(); + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u2, dudt),u.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(v, v2, dvdt),v.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(e, e2, dedt),e.begin(), rk_update3); } #ifdef DEBUG std::cout << "Iter " << i << "\n"; @@ -609,9 +567,18 @@ int run( #endif } } - dr::mp::halo(u).exchange_finalize(); - dr::mp::halo(v).exchange_finalize(); - dr::mp::halo(e).exchange_finalize(); + + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + + auto toc = std::chrono::steady_clock::now(); std::chrono::duration duration = toc - tic; if (comm_rank == 0) { diff --git a/benchmarks/gbench/mp/wave_equation.cpp b/benchmarks/gbench/mp/wave_equation.cpp index 4b8dc33c92..71ac12bba9 100644 --- a/benchmarks/gbench/mp/wave_equation.cpp +++ b/benchmarks/gbench/mp/wave_equation.cpp @@ -71,6 +71,8 @@ double initial_elev(double x, double y, double lx, double ly) { return exact_elev(x, y, 0.0, lx, ly); } +//#define DEBUG + void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, double g, double h, double dx_inv, double dy_inv, double dt) { /** @@ -82,6 +84,9 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; }; { +#ifdef DEBUG + std::cout << "stage1\n"; +#endif std::array start{1, 0}; std::array end{e.extent(0) - 1, e.extent(1)}; auto e_view = dr::mp::views::submdspan(e.view(), start, end); @@ -94,6 +99,9 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; }; { +#ifdef DEBUG + std::cout << "stage2\n"; +#endif std::array start{0, 1}; std::array end{e.extent(0), e.extent(1)}; auto e_view = dr::mp::views::submdspan(e.view(), start, end); @@ -110,6 +118,9 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * h * (dudx + dvdy); }; { +#ifdef DEBUG + std::cout << "stage3\n"; +#endif std::array start{1, 0}; std::array end{u.extent(0), u.extent(1)}; auto u_view = dr::mp::views::submdspan(u.view(), start, end); @@ -117,6 +128,9 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); } +#ifdef DEBUG + std::cout << "after\n"; +#endif }; void stage1(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, @@ -313,8 +327,6 @@ void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, dr::mp::halo(e).exchange_begin(); }; -//#define DEBUG - #ifdef DEBUG void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { std::cout << "Array " << str << ":\n"; diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index e57dd03e89..62f54b88d6 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -92,7 +92,7 @@ namespace __detail { seg_info.second ); }); - op(stencils, index); + op(stencils); }; if (mp::use_sycl()) { dr::drlog.debug(" using sycl\n"); @@ -113,10 +113,14 @@ namespace __detail { } } - void stencil_for_each_extended_2(auto op, stencil_index_type<1> begin, stencil_index_type<1> end, const auto& segs) { + void stencil_for_each_extended_2(auto op, stencil_index_type<2>& begin, stencil_index_type<2> end, const auto& segs) { auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); - auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; auto is_zero = [](auto a) { return a != 0; }; auto zipped = zip_view(seg0_begin, seg0_end); @@ -125,39 +129,50 @@ namespace __detail { if ((distance | std::views::filter(is_zero)).empty()) return; - auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { - return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], seg.extents()); + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair( + md::mdspan( + std::to_address(&seg.mdspan_extended()(begin_stencil[0], begin_stencil[1])), + ext + ), ext); }); auto do_point = [seg_infos, op](auto index) { auto stencils = dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { return md::mdspan( - std::to_address(dr::ranges::local(seg_info.first + index)), - seg_info.second - ); + std::to_address(&seg_info.first(index[0], index[1])), + seg_info.second); }); - op(stencils, index); + op(stencils); }; - if (mp::use_sycl()) { - dr::drlog.debug(" using sycl\n"); - -#ifdef SYCL_LANGUAGE_VERSION - dr::__detail::parallel_for( - dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), - do_point) - .wait(); -#else - assert(false); -#endif - } else { - dr::drlog.debug(" using cpu\n"); +// if (mp::use_sycl()) { +// +//#ifdef SYCL_LANGUAGE_VERSION +// dr::__detail::parallel_for( +// dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), +// do_point) +// .wait(); +//#else +// assert(false); +//#endif +// } else { for (std::size_t i = 0; i < distance[0]; i++) { - for (std::size_t i = 0; i < distance[1]; i++) { - do_point(i); + for (std::size_t j = 0; j < distance[1]; j++) { + auto seg0 = std::get<0>(segs); + auto origin0 = seg0.origin(); + auto begin0 = seg0_begin; + std::cout << origin0[0] + i + begin0[0] << " " << origin0[1] + j + begin0[1] << "\n"; +// auto seg1 = std::get<1>(segs); +// auto origin1 = seg1.origin(); +// auto begin1 = seg1.begin_stencil(begin); +// std::cout << "snd " << origin1[0] + i + begin1[0] << " " << origin1[1] + j + begin1[1] << "\n"; + do_point(stencil_index_type<2>{i, j}); } } - } +// } } } diff --git a/include/dr/mp/algorithms/md_for_each.hpp b/include/dr/mp/algorithms/md_for_each.hpp index 9d92f0fe98..5e145f91c7 100644 --- a/include/dr/mp/algorithms/md_for_each.hpp +++ b/include/dr/mp/algorithms/md_for_each.hpp @@ -103,6 +103,7 @@ void stencil_for_each(auto op, is_mdspan_view auto &&...drs) { #else for (std::size_t i = 0; i < mdspan0.extents().extent(0); i++) { for (std::size_t j = 0; j < mdspan0.extents().extent(1); j++) { +// std::cout << seg0.origin()[0] + i << " " << seg0.origin()[1] + j << "\n"; invoke_index(std::array{i, j}); } } diff --git a/include/dr/mp/containers/distributed_mdarray.hpp b/include/dr/mp/containers/distributed_mdarray.hpp index 14cfdd2945..3ad0a25c11 100644 --- a/include/dr/mp/containers/distributed_mdarray.hpp +++ b/include/dr/mp/containers/distributed_mdarray.hpp @@ -17,20 +17,21 @@ template class distributed_mdarray { distributed_mdarray(dr::__detail::dr_extents shape, distribution dist = distribution()) : tile_shape_(tile_shape(shape)), dv_(dv_size(), dv_dist(dist, shape)), - md_view_(make_md_view(dv_, shape, tile_shape_)) {} + md_view_(make_md_view(dv_, shape, tile_shape_, dist)), dist_(dist) {} auto begin() const { return rng::begin(md_view_); } auto end() const { return rng::end(md_view_); } auto size() const { return rng::size(md_view_); } auto operator[](auto n) { return md_view_[n]; } - auto segments() { return dr::ranges::segments(md_view_); } + auto segments() const { return dr::ranges::segments(md_view_); } auto &halo() const { return dr::mp::halo(dv_); } auto mdspan() const { return md_view_.mdspan(); } auto extent(std::size_t r) const { return mdspan().extent(r); } auto grid() { return md_view_.grid(); } auto view() const { return md_view_; } + auto dist() const { return dist_; } auto operator==(const distributed_mdarray &other) const { return std::equal(begin(), end(), other.begin()); @@ -70,16 +71,17 @@ template class distributed_mdarray { // This wrapper seems to avoid an issue with template argument // deduction for mdspan_view static auto make_md_view(const DV &dv, shape_type shape, - shape_type tile_shape) { - return views::mdspan(dv, shape, tile_shape); + shape_type tile_shape, distribution dist) { + return views::mdspan(dv, shape, tile_shape, dist); } shape_type tile_shape_; DV dv_; using mdspan_type = decltype(make_md_view(std::declval(), std::declval(), - std::declval())); + std::declval(), std::declval())); mdspan_type md_view_; + distribution dist_; }; template diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 02f0244992..8e799b79d8 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,7 +276,7 @@ template class distributed_vector { void fence() { backend.fence(); } - auto &dist() const { + const auto &dist() const { return distribution_; } private: @@ -295,13 +295,7 @@ template class distributed_vector { segment_size_ = gran * std::max({(size / gran + comm_size - 1) / comm_size, hb.prev / gran, hb.next / gran}); - __detail::extended_local_data_distribution ext_dist; - if (default_comm().rank() * segment_size_ >= hb.prev) - ext_dist.begin = default_comm().rank() * segment_size_ - hb.prev; - else - ext_dist.begin = 0; - ext_dist.end = std::min((default_comm().rank() + 1) * segment_size_ + hb.next, size_); - ext_dist.segment_size = segment_size_; + extended_local_data_distribution ext_dist(segment_size_, size_, hb); data_size_ = segment_size_ + hb.prev + hb.next; diff --git a/include/dr/mp/containers/distribution.hpp b/include/dr/mp/containers/distribution.hpp index d933520dd4..2fe0049b07 100644 --- a/include/dr/mp/containers/distribution.hpp +++ b/include/dr/mp/containers/distribution.hpp @@ -56,4 +56,22 @@ struct distribution { std::size_t granularity_ = 1; }; +struct extended_local_data_distribution { + std::size_t begin; + std::size_t end; + std::size_t segment_size; + + extended_local_data_distribution() = default; + extended_local_data_distribution(std::size_t segment_size, + std::size_t size, + halo_bounds hb) + : segment_size(segment_size) { + if (default_comm().rank() * segment_size >= hb.prev) + begin = default_comm().rank() * segment_size - hb.prev; + else + begin = 0; + end = std::min((default_comm().rank() + 1) * segment_size + hb.next, size); + } +}; + } // namespace dr::mp diff --git a/include/dr/mp/containers/segment.hpp b/include/dr/mp/containers/segment.hpp index 5f6b4f020a..1318ecf53d 100644 --- a/include/dr/mp/containers/segment.hpp +++ b/include/dr/mp/containers/segment.hpp @@ -6,14 +6,6 @@ namespace dr::mp { -namespace __detail { - struct extended_local_data_distribution { - std::size_t begin; - std::size_t end; - std::size_t segment_size; - }; -} // __detail - template class dv_segment_iterator; template class dv_segment_reference { @@ -228,7 +220,7 @@ template class dv_segment { using difference_type = std::ptrdiff_t; dv_segment() = default; dv_segment(DV *dv, std::size_t segment_index, std::size_t size, - std::size_t reserved, const __detail::extended_local_data_distribution& ext_dist) { + std::size_t reserved, const extended_local_data_distribution& ext_dist) { dv_ = dv; segment_index_ = segment_index; size_ = size; @@ -236,7 +228,7 @@ template class dv_segment { ext_dist_ = ext_dist; begin_index_ = segment_index * ext_dist.segment_size; - end_index_ = segment_index * ext_dist.segment_size + size_; + end_index_ = begin_index_ + size_; assert(dv_ != nullptr); } @@ -272,7 +264,7 @@ template class dv_segment { std::size_t begin_index_; std::size_t end_index_; - __detail::extended_local_data_distribution ext_dist_; + extended_local_data_distribution ext_dist_; }; // dv_segment // diff --git a/include/dr/mp/views/mdspan_view.hpp b/include/dr/mp/views/mdspan_view.hpp index 1a67f8d78f..568d9b6c5f 100644 --- a/include/dr/mp/views/mdspan_view.hpp +++ b/include/dr/mp/views/mdspan_view.hpp @@ -25,16 +25,22 @@ namespace dr::mp::__detail { template class md_segment : public rng::view_interface> { private: + using stencil_index_type = dr::__detail::dr_extents; public: using index_type = dr::__detail::dr_extents; md_segment() {} - md_segment(index_type origin, BaseSegment segment, index_type tile_shape) + md_segment(index_type origin, BaseSegment segment, index_type tile_shape, extended_local_data_distribution ext_dist) : base_(segment), origin_(origin), - mdspan_(local_tile(segment, tile_shape)) { + mdspan_(local_tile(segment, tile_shape)), + mdspan_extended_(local_tile_extended(segment, tile_shape)), + ext_dist_(ext_dist) { dr::drlog.debug(dr::logger::mdspan_view, "md_segment\n origin: {}\n tile shape: {}\n", origin, tile_shape); + + for (std::size_t i = 0; i < Rank; i++) + end_[i] = origin_[i] + tile_shape[i]; } // view_interface uses below to define everything else @@ -45,8 +51,29 @@ class md_segment : public rng::view_interface> { auto halo() const { return dr::mp::halo(base_); } + [[nodiscard]] stencil_index_type begin_stencil(stencil_index_type stencil) const { + stencil_index_type out; + // Supports only 1d distribution + for (std::size_t i = 0; i < Rank; i++) { + out[i] = std::min(std::max(origin_[i], (i == 0 ? ext_dist_.begin : origin_[i]) + stencil[i]), end_[i]) - origin_[i]; + } + return out; + } + [[nodiscard]] stencil_index_type end_stencil(stencil_index_type stencil) const { + stencil_index_type out; + // Supports only 1d distribution + for (std::size_t i = 0; i < Rank; i++) { + out[i] = std::max(std::min(end_[i], (i == 0 ? ext_dist_.end : end_[i]) - stencil[i]), origin_[i]) - origin_[i]; + } + return out; + } + [[nodiscard]] std::pair stencil(stencil_index_type begin, stencil_index_type end) const { + return {begin_stencil(begin), end_stencil(end)}; + } + // mdspan-specific methods auto mdspan() const { return mdspan_; } + auto mdspan_extended() const { return mdspan_extended_; } auto origin() const { return origin_; } // for slices, this would be the underlying mdspan auto root_mdspan() const { return mdspan(); } @@ -62,9 +89,19 @@ class md_segment : public rng::view_interface> { return md::mdspan(ptr, tile_shape); } + static auto local_tile_extended(BaseSegment segment, const index_type &tile_shape) { + // Undefined behavior if the segments is not local + T *ptr = std::to_address(dr::ranges::local(rng::begin(segment))); + return md::mdspan(ptr, tile_shape); + } + BaseSegment base_; index_type origin_; + index_type end_; md::mdspan, md::layout_stride> mdspan_; + md::mdspan, md::layout_stride> mdspan_extended_; + + extended_local_data_distribution ext_dist_; }; } // namespace dr::mp::__detail @@ -107,7 +144,9 @@ struct mdspan_view : public rng::view_interface> { return origin; } - static auto make_segments(auto base, auto full_shape, auto tile_shape) { + static auto make_segments(auto base, auto full_shape, auto tile_shape, auto dist) { + extended_local_data_distribution ext_dist(tile_shape[0], full_shape[0], dist.halo()); + auto make_md = [=](auto v) { auto clipped = tile_shape; std::size_t segment_index = std::get<0>(v); @@ -117,7 +156,7 @@ struct mdspan_view : public rng::view_interface> { } return __detail::md_segment( segment_index_to_global_origin(segment_index, full_shape, tile_shape), - std::get<1>(v), clipped); + std::get<1>(v), clipped, ext_dist); }; dr::drlog.debug(dr::logger::mdspan_view, @@ -128,10 +167,10 @@ struct mdspan_view : public rng::view_interface> { rng::views::transform(make_md); } using segments_type = decltype(make_segments(std::declval(), - full_shape_, tile_shape_)); + full_shape_, tile_shape_, std::declval())); public: - mdspan_view(R r, dr::__detail::dr_extents full_shape) + mdspan_view(R r, dr::__detail::dr_extents full_shape, distribution dist) : base_(rng::views::all(std::forward(r))) { full_shape_ = full_shape; @@ -140,15 +179,15 @@ struct mdspan_view : public rng::view_interface> { tile_shape_[0] = decomp::div; replace_decomp(); - segments_ = make_segments(base_, full_shape_, tile_shape_); + segments_ = make_segments(base_, full_shape_, tile_shape_, dist); } mdspan_view(R r, dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) + dr::__detail::dr_extents tile_shape, distribution dist) : base_(rng::views::all(std::forward(r))), full_shape_(full_shape), tile_shape_(tile_shape) { replace_decomp(); - segments_ = make_segments(base_, full_shape_, tile_shape_); + segments_ = make_segments(base_, full_shape_, tile_shape_, dist); } // Base implements random access range @@ -194,12 +233,12 @@ struct mdspan_view : public rng::view_interface> { }; template -mdspan_view(R &&r, dr::__detail::dr_extents extents) +mdspan_view(R &&r, dr::__detail::dr_extents extents, distribution dist) -> mdspan_view, Rank>; template mdspan_view(R &&r, dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) + dr::__detail::dr_extents tile_shape, distribution dist) -> mdspan_view, Rank>; template @@ -213,17 +252,19 @@ namespace dr::mp::views { template class mdspan_adapter_closure { public: mdspan_adapter_closure(dr::__detail::dr_extents full_shape, - dr::__detail::dr_extents tile_shape) - : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true) {} + dr::__detail::dr_extents tile_shape, + distribution dist) + : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true), dist_(dist) {} - mdspan_adapter_closure(dr::__detail::dr_extents full_shape) - : full_shape_(full_shape) {} + mdspan_adapter_closure(dr::__detail::dr_extents full_shape, + distribution dist) + : full_shape_(full_shape), dist_(dist) {} template auto operator()(R &&r) const { if (tile_valid_) { - return mdspan_view(std::forward(r), full_shape_, tile_shape_); + return mdspan_view(std::forward(r), full_shape_, tile_shape_, dist_); } else { - return mdspan_view(std::forward(r), full_shape_); + return mdspan_view(std::forward(r), full_shape_, dist_); } } @@ -236,31 +277,34 @@ template class mdspan_adapter_closure { dr::__detail::dr_extents full_shape_; dr::__detail::dr_extents tile_shape_; bool tile_valid_ = false; + distribution dist_; }; class mdspan_fn_ { public: template - auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape) const { + auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape, distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), - std::forward(tile_shape))( + std::forward(tile_shape), + dist)( std::forward(r)); } template - auto operator()(R &&r, Shape &&full_shape) const { - return mdspan_adapter_closure(std::forward(full_shape))( + auto operator()(R &&r, Shape &&full_shape, distribution dist) const { + return mdspan_adapter_closure(std::forward(full_shape), dist)( std::forward(r)); } template - auto operator()(Shape &&full_shape, Shape &&tile_shape) const { + auto operator()(Shape &&full_shape, Shape &&tile_shape, distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), - std::forward(tile_shape)); + std::forward(tile_shape), + dist); } - template auto operator()(Shape &&full_shape) const { - return mdspan_adapter_closure(std::forward(full_shape)); + template auto operator()(Shape &&full_shape, distribution dist) const { + return mdspan_adapter_closure(std::forward(full_shape), dist); } }; diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 4868dfb81c..6e1be792fa 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -67,7 +67,7 @@ add_executable(mp-quick-test ) add_executable(mp-quick-test-3-only mp-tests.cpp - wide-halo-1d-3.cpp + wide-halo-2d-3.cpp ) # cmake-format: on diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp index 7b0d4890b8..20717ce74c 100644 --- a/test/gtest/mp/wide-halo-1d-3.cpp +++ b/test/gtest/mp/wide-halo-1d-3.cpp @@ -46,11 +46,11 @@ TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { }; auto transform = [&]{ - stencil_for_each_extended<1>([](auto stencils, auto id){ + stencil_for_each_extended<1>([](auto stencils){ auto [x, x_out] = stencils; x_out(0) = x(-1) + x(0) + x(1); }, {1}, {1}, dv, dv_out); - stencil_for_each_extended<1>([](auto stencils, auto id){ + stencil_for_each_extended<1>([](auto stencils){ auto [x, x_out] = stencils; x(0) = x_out(0); }, {0}, {0}, dv, dv_out); @@ -150,11 +150,11 @@ TEST(WideHalo3, halo_api_works) { dv_out.halo().exchange(); halo_exchange([](Array& dv, Array& dv_out){ - stencil_for_each_extended<1>([](auto stencils, auto id){ + stencil_for_each_extended<1>([](auto stencils){ auto [x, x_out] = stencils; x_out(0) = x(-1) + x(0) + x(1); }, {1}, {1}, dv, dv_out); - stencil_for_each_extended<1>([](auto stencils, auto id){ + stencil_for_each_extended<1>([](auto stencils){ auto [x, x_out] = stencils; x(0) = x_out(0); }, {0}, {0}, dv, dv_out); diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp new file mode 100644 index 0000000000..99ac5304ca --- /dev/null +++ b/test/gtest/mp/wide-halo-2d-3.cpp @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +template class WideHalo3 : public testing::Test {}; + +using T = int; +using Array = dr::mp::distributed_mdarray; + +const std::size_t redundancy = 2; +const std::array size = {6, 6}; + +dr::mp::distribution get_distribution() { + return dr::mp::distribution() + .halo(1) + .redundancy(redundancy); +} + +//int& get(Array& v, std::size_t i, std::size_t j) { +// return *(v.begin() + i * 6 + j).local(); +//} +// +//const int& get(const Array& v, std::size_t i, std::size_t j) { +// return *(v.begin() + i * 6 + j).local(); +//} + +TEST(WideHalo3, suite_works_for_3_processes_only) { + EXPECT_EQ(dr::mp::default_comm().size(), 3); +} + +TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto transform = [&]{ + stencil_for_each_extended<2>([](auto stencils){ + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>([](auto stencils){ + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, {0, 0}, {0, 0}, dv, dv_out); + }; + auto print = [](std::string s, const auto& v) { + std::cout << s << "\n"; + for (auto seg : v.segments()) { + auto [beg, end] = seg.stencil({0, 0}, {0, 0}); + for (std::size_t i = beg[0]; i < end[0]; i++) { + for (std::size_t j = beg[1]; j < end[1]; j++) { + std::cout << seg.mdspan_extended()(i, j) << "\t"; + } + std::cout << "\n"; + } + } + std::cout << "\n"; + }; + + print("dv", dv); + transform(); + print("dv", dv); + transform(); + print("dv", dv); + dv.halo().exchange(); + dv_out.halo().exchange(); + print("dv", dv); +} + +TEST(WideHalo3, halo2d_api_works) { + dr::mp::distribution dist = get_distribution(); + Array dv(size, dist); + Array dv_out(size, dist); + + fill(dv, 1); + fill(dv_out, 1); + dv.halo().exchange(); + dv_out.halo().exchange(); + + auto print = [](std::string s, const auto& v) { + std::cout << s << "\n"; + for (auto seg : v.segments()) { + auto [beg, end] = seg.stencil({0, 0}, {0, 0}); + for (std::size_t i = beg[0]; i < end[0]; i++) { + for (std::size_t j = beg[1]; j < end[1]; j++) { + std::cout << seg.mdspan_extended()(i, j) << "\t"; + } + std::cout << "\n"; + } + } + std::cout << "\n"; + }; + + print("dv", dv); + + halo_exchange([](Array& dv, Array& dv_out){ + stencil_for_each_extended<2>([](auto stencils){ + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>([](auto stencils){ + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, {0, 0}, {0, 0}, dv, dv_out); + }, dv, dv_out); + + print("dv", dv); +} From adeec1235778f9822dca952a2e87c19ce375c56a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Sun, 20 Oct 2024 18:35:21 +0200 Subject: [PATCH 05/19] Code clean up --- benchmarks/gbench/mp/CMakeLists.txt | 22 +- benchmarks/gbench/mp/wave_equation-wide.cpp | 768 -------------------- benchmarks/gbench/mp/wave_equation.cpp | 43 -- benchmarks/gbench/mp/wave_equation_wide.cpp | 442 +++++++++++ include/dr/mp/algorithms/for_each.hpp | 42 +- include/dr/mp/algorithms/md_for_each.hpp | 1 - include/dr/mp/views/mdspan_view.hpp | 1 - test/gtest/mp/wide-halo-2d-3.cpp | 147 +++- 8 files changed, 615 insertions(+), 851 deletions(-) delete mode 100644 benchmarks/gbench/mp/wave_equation-wide.cpp create mode 100644 benchmarks/gbench/mp/wave_equation_wide.cpp diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index 6095a65807..0ad4fb096c 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -80,21 +80,31 @@ add_executable(wave_equation wave_equation.cpp) target_link_libraries(wave_equation cxxopts DR::mpi) target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) add_mp_ctest(NAME wave_equation) -add_executable(wave_equation-wide wave_equation-wide.cpp) -target_link_libraries(wave_equation-wide cxxopts DR::mpi) -target_compile_definitions(wave_equation-wide PRIVATE STANDALONE_BENCHMARK) -add_mp_ctest(NAME wave_equation-wide) +add_executable(wave_equation_wide wave_equation_wide.cpp) +target_link_libraries(wave_equation_wide cxxopts DR::mpi) +target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) +add_mp_ctest(NAME wave_equation_wide) # add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # # DRA-92 if(ENABLE_SYCL) add_mp_ctest( TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) + add_mp_ctest( + TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) add_mp_ctest( TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) add_mp_ctest( - TEST_NAME wave_equation-wide-sycl NAME wave_equation-wide TIMEOUT 1000 NPROC 8 SYCL) + TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) + endforeach() add_mp_ctest( - TEST_NAME wave_equation-wide_fused-sycl NAME wave_equation-wide TIMEOUT 1000 NPROC 2 SYCL TARGS -f) + TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) + endforeach() endif() add_executable(shallow_water shallow_water.cpp) diff --git a/benchmarks/gbench/mp/wave_equation-wide.cpp b/benchmarks/gbench/mp/wave_equation-wide.cpp deleted file mode 100644 index 9fcf3fe12d..0000000000 --- a/benchmarks/gbench/mp/wave_equation-wide.cpp +++ /dev/null @@ -1,768 +0,0 @@ -// SPDX-FileCopyrightText: Intel Corporation -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "cxxopts.hpp" -#include "dr/mp.hpp" -#include "mpi.h" -#include "wave_utils.hpp" -#include -#include - -#ifdef STANDALONE_BENCHMARK - -MPI_Comm comm; -int comm_rank; -int comm_size; - -#else - -#include "../common/dr_bench.hpp" - -#endif - -namespace WaveEquation { - -using T = double; -using Array = dr::mp::distributed_mdarray; - -// gravitational acceleration -constexpr double g = 9.81; -// water depth -constexpr double h = 1.0; - -// Get number of read/write bytes and flops for a single time step -// These numbers correspond to the fused kernel version -void calculate_complexity(std::size_t nx, std::size_t ny, std::size_t &nread, - std::size_t &nwrite, std::size_t &nflop) { - // stage1: 2+2+3 = 7 - // stage2: 3+3+4 = 10 - // stage3: 3+3+4 = 10 - nread = (27 * nx * ny) * sizeof(T); - // stage1: 3 - // stage2: 3 - // stage3: 3 - nwrite = (9 * nx * ny) * sizeof(T); - // stage1: 3+3+6 = 12 - // stage2: 6+6+9 = 21 - // stage3: 6+6+9 = 21 - nflop = 54 * nx * ny; -} - -double exact_elev(double x, double y, double t, double lx, double ly) { - /** - * Exact solution for elevation field. - * - * Returns time-dependent elevation of a 2D standing wave in a - * rectangular domain. - */ - double amp = 0.5; - double c = std::sqrt(g * h); - std::size_t n = 1; - double sol_x = std::cos(2 * n * M_PI * x / lx); - std::size_t m = 1; - double sol_y = std::cos(2 * m * M_PI * y / ly); - double omega = c * M_PI * std::hypot(n / lx, m / ly); - double sol_t = std::cos(2 * omega * t); - return amp * sol_x * sol_y * sol_t; -} - -double initial_elev(double x, double y, double lx, double ly) { - return exact_elev(x, y, 0.0, lx, ly); -} - -#define DEBUG - -void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, - double g, double h, double dx_inv, double dy_inv, double dt) { - /** - * Evaluate right hand side of the equations - */ - auto rhs_dedx = [dt, g, dx_inv](auto v) { - auto [in, out] = v; - out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; - }; - { -#ifdef DEBUG - std::cout << "stage1\n"; -#endif -// std::array start{1, 0}; -// std::array end{e.extent(0) - 1, e.extent(1)}; -// auto e_view = dr::mp::views::submdspan(e.view(), start, end); -// auto dudt_view = dr::mp::views::submdspan(dudt.view(), start, end); -// dr::mp::stencil_for_each(rhs_dedx, e_view, dudt_view); - stencil_for_each_extended<2>(rhs_dedx, {0, 0}, {1, 0}, e, dudt); - } - - auto rhs_dedy = [dt, g, dy_inv](auto v) { - auto [in, out] = v; - out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; - }; - { -#ifdef DEBUG - std::cout << "stage2\n"; -#endif -// std::array start{0, 1}; -// std::array end{e.extent(0), e.extent(1)}; -// auto e_view = dr::mp::views::submdspan(e.view(), start, end); -// auto dvdt_view = dr::mp::views::submdspan(dvdt.view(), start, end); -// dr::mp::stencil_for_each(rhs_dedy, e_view, dvdt_view); - stencil_for_each_extended<2>(rhs_dedy, {0, 1}, {0, 0}, e, dvdt); - } - - auto rhs_div = [dt, h, dx_inv, dy_inv](auto args) { - auto [u, v, out] = args; - auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; - auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; - out(0, 0) = -dt * h * (dudx + dvdy); - }; - { -#ifdef DEBUG - std::cout << "stage3\n"; -#endif -// std::array start{1, 0}; -// std::array end{u.extent(0), u.extent(1)}; -// auto u_view = dr::mp::views::submdspan(u.view(), start, end); -// auto v_view = dr::mp::views::submdspan(v.view(), start, end); -// auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); -// dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); - stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt); - } -#ifdef DEBUG - std::cout << "after\n"; -#endif -}; - -void stage1(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, - double g, double h, double dx_inv, double dy_inv, double dt) { - /** - * Evaluate stage 1 of the RK time stepper - * - * u1 = u + dt*rhs(u) - * - */ - // u: elevation x gradient - dr::mp::halo(e).exchange_finalize(); - auto rhs_u1 = [dt, g, dx_inv](auto tuple) { - auto [e, u, out] = tuple; - auto dedx = (e(1, 0) - e(0, 0)) * dx_inv; - out(0, 0) = u(0, 0) - dt * g * dedx; - }; - { - std::array start{1, 0}; - std::array end{e.extent(0) - 1, e.extent(1)}; - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto u_view = dr::mp::views::submdspan(u.view(), start, end); - auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); - dr::mp::stencil_for_each(rhs_u1, e_view, u_view, u1_view); - } - dr::mp::halo(u1).exchange_begin(); - - // v: elevation y gradient - auto rhs_v1 = [dt, g, dy_inv](auto tuple) { - auto [e, v, out] = tuple; - auto dedy = (e(0, 0) - e(0, -1)) * dy_inv; - out(0, 0) = v(0, 0) - dt * g * dedy; - }; - { - std::array start{0, 1}; - std::array end{e.extent(0), e.extent(1)}; - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto v_view = dr::mp::views::submdspan(v.view(), start, end); - auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); - dr::mp::stencil_for_each(rhs_v1, e_view, v_view, v1_view); - } - dr::mp::halo(v1).exchange_begin(); - - // e: divergence of (u, v) - dr::mp::halo(u).exchange_finalize(); - dr::mp::halo(v).exchange_finalize(); - auto rhs_e1 = [dt, h, dx_inv, dy_inv](auto tuple) { - auto [e, u, v, out] = tuple; - auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; - auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; - out(0, 0) = e(0, 0) - dt * h * (dudx + dvdy); - }; - { - std::array start{1, 0}; - std::array end{u.extent(0), u.extent(1)}; - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto u_view = dr::mp::views::submdspan(u.view(), start, end); - auto v_view = dr::mp::views::submdspan(v.view(), start, end); - auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); - dr::mp::stencil_for_each(rhs_e1, e_view, u_view, v_view, e1_view); - } - dr::mp::halo(e1).exchange_begin(); -}; - -void stage2(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, - Array &u2, Array &v2, Array &e2, double g, double h, double dx_inv, - double dy_inv, double dt) { - /** - * Evaluate stage 2 of the RK time stepper - * - * u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) - * - */ - // u: elevation x gradient - dr::mp::halo(e1).exchange_finalize(); - auto rhs_u2 = [dt, g, dx_inv](auto tuple) { - auto [e1, u1, u, out] = tuple; - auto dedx = (e1(1, 0) - e1(0, 0)) * dx_inv; - out(0, 0) = 0.75 * u(0, 0) + 0.25 * (u1(0, 0) - dt * g * dedx); - }; - { - std::array start{1, 0}; - std::array end{e.extent(0) - 1, e.extent(1)}; - auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); - auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); - auto u_view = dr::mp::views::submdspan(u.view(), start, end); - auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); - dr::mp::stencil_for_each(rhs_u2, e1_view, u1_view, u_view, u2_view); - } - dr::mp::halo(u2).exchange_begin(); - - // v: elevation y gradient - auto rhs_v2 = [dt, g, dy_inv](auto tuple) { - auto [e1, v1, v, out] = tuple; - auto dedy = (e1(0, 0) - e1(0, -1)) * dy_inv; - out(0, 0) = 0.75 * v(0, 0) + 0.25 * (v1(0, 0) - dt * g * dedy); - }; - { - std::array start{0, 1}; - std::array end{e.extent(0), e.extent(1)}; - auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); - auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); - auto v_view = dr::mp::views::submdspan(v.view(), start, end); - auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); - dr::mp::stencil_for_each(rhs_v2, e1_view, v1_view, v_view, v2_view); - } - dr::mp::halo(v2).exchange_begin(); - - // e: divergence of (u, v) - dr::mp::halo(u1).exchange_finalize(); - dr::mp::halo(v1).exchange_finalize(); - auto rhs_e2 = [dt, h, dx_inv, dy_inv](auto tuple) { - auto [e1, u1, v1, e, out] = tuple; - auto dudx = (u1(0, 0) - u1(-1, 0)) * dx_inv; - auto dvdy = (v1(0, 1) - v1(0, 0)) * dy_inv; - out(0, 0) = 0.75 * e(0, 0) + 0.25 * (e1(0, 0) - dt * h * (dudx + dvdy)); - }; - { - std::array start{1, 0}; - std::array end{u.extent(0), u.extent(1)}; - auto e1_view = dr::mp::views::submdspan(e1.view(), start, end); - auto u1_view = dr::mp::views::submdspan(u1.view(), start, end); - auto v1_view = dr::mp::views::submdspan(v1.view(), start, end); - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); - dr::mp::stencil_for_each(rhs_e2, e1_view, u1_view, v1_view, e_view, - e2_view); - } - dr::mp::halo(e2).exchange_begin(); -}; - -void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, - double g, double h, double dx_inv, double dy_inv, double dt) { - /** - * Evaluate stage 3 of the RK time stepper - * - * u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) - * - */ - // u: elevation x gradient - dr::mp::halo(e2).exchange_finalize(); - auto rhs_u3 = [dt, g, dx_inv](auto tuple) { - auto [e2, u2, out] = tuple; - auto dedx = (e2(1, 0) - e2(0, 0)) * dx_inv; - out(0, 0) *= 1.0 / 3; - out(0, 0) += 2.0 / 3 * (u2(0, 0) - dt * g * dedx); - }; - { - std::array start{1, 0}; - std::array end{e.extent(0) - 1, e.extent(1)}; - auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); - auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); - auto u_view = dr::mp::views::submdspan(u.view(), start, end); - dr::mp::stencil_for_each(rhs_u3, e2_view, u2_view, u_view); - } - dr::mp::halo(u).exchange_begin(); - - // v: elevation y gradient - auto rhs_v3 = [dt, g, dy_inv](auto tuple) { - auto [e2, v2, out] = tuple; - auto dedy = (e2(0, 0) - e2(0, -1)) * dy_inv; - out(0, 0) *= 1.0 / 3; - out(0, 0) += 2.0 / 3 * (v2(0, 0) - dt * g * dedy); - }; - { - std::array start{0, 1}; - std::array end{e.extent(0), e.extent(1)}; - auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); - auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); - auto v_view = dr::mp::views::submdspan(v.view(), start, end); - dr::mp::stencil_for_each(rhs_v3, e2_view, v2_view, v_view); - } - dr::mp::halo(v).exchange_begin(); - - // e: divergence of (u, v) - dr::mp::halo(u2).exchange_finalize(); - dr::mp::halo(v2).exchange_finalize(); - auto rhs_e3 = [dt, h, dx_inv, dy_inv](auto tuple) { - auto [e2, u2, v2, out] = tuple; - auto dudx = (u2(0, 0) - u2(-1, 0)) * dx_inv; - auto dvdy = (v2(0, 1) - v2(0, 0)) * dy_inv; - out(0, 0) *= 1.0 / 3; - out(0, 0) += 2.0 / 3 * (e2(0, 0) - dt * h * (dudx + dvdy)); - }; - { - std::array start{1, 0}; - std::array end{u.extent(0), u.extent(1)}; - auto e2_view = dr::mp::views::submdspan(e2.view(), start, end); - auto u2_view = dr::mp::views::submdspan(u2.view(), start, end); - auto v2_view = dr::mp::views::submdspan(v2.view(), start, end); - auto e_view = dr::mp::views::submdspan(e.view(), start, end); - dr::mp::stencil_for_each(rhs_e3, e2_view, u2_view, v2_view, e_view); - } - dr::mp::halo(e).exchange_begin(); -}; - -#ifdef DEBUG -void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { - std::cout << "Array " << str << ":\n"; - std::cout << arr << "\n"; -} -#endif - -int run( - int n, bool benchmark_mode, bool fused_kernels, - std::function iter_callback = []() {}) { - // construct grid - // number of cells in x, y direction - std::size_t nx = n; - std::size_t ny = n; - const double xmin = -1, xmax = 1; - const double ymin = -1, ymax = 1; - ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); - - auto dist = dr::mp::distribution() - .halo(1) - .redundancy(2); - - // statistics - std::size_t nread, nwrite, nflop; - calculate_complexity(nx, ny, nread, nwrite, nflop); - - if (comm_rank == 0) { - std::cout << "Using backend: dr" << std::endl; - if (fused_kernels) { - std::cout << "Using fused kernels" << std::endl; - } - std::cout << "Grid size: " << nx << " x " << ny << std::endl; - std::cout << "Elevation DOFs: " << nx * ny << std::endl; - std::cout << "Velocity DOFs: " << (nx + 1) * ny + nx * (ny + 1) - << std::endl; - std::cout << "Total DOFs: " << nx * ny + (nx + 1) * ny + nx * (ny + 1); - std::cout << std::endl; - } - - // compute time step - double t_end = 1.0; - double t_export = 0.02; - - double c = std::sqrt(g * h); - double alpha = 0.5; - double dt = alpha * std::min(grid.dx, grid.dy) / c; - dt = t_export / static_cast(ceil(t_export / dt)); - std::size_t nt = static_cast(ceil(t_end / dt)); - if (benchmark_mode) { - nt = 100; - dt = 1e-5; - t_export = 25 * dt; - t_end = nt * dt; - } - if (comm_rank == 0) { - std::cout << "Time step: " << dt << " s" << std::endl; - std::cout << "Total run time: " << std::fixed << std::setprecision(1); - std::cout << t_end << " s, "; - std::cout << nt << " time steps" << std::endl; - } - - // state variables - // water elevation at T points - Array e({nx + 1, ny}, dist); - dr::mp::fill(e, 0.0); - // x velocity at U points - Array u({nx + 1, ny}, dist); - dr::mp::fill(u, 0.0); - // y velocity at V points - Array v({nx + 1, ny + 1}, dist); - dr::mp::fill(v, 0.0); - - // state for RK stages - Array e1({nx + 1, ny}, dist); - Array u1({nx + 1, ny}, dist); - Array v1({nx + 1, ny + 1}, dist); - Array e2({nx + 1, ny}, dist); - Array u2({nx + 1, ny}, dist); - Array v2({nx + 1, ny + 1}, dist); - - // time tendencies - // NOTE not needed if rhs kernels are fused with RK stage assignment - Array dedt({nx + 1, ny}, dist); - Array dudt({nx + 1, ny}, dist); - Array dvdt({nx + 1, ny + 1}, dist); - - dr::mp::fill(dedt, 0); - dr::mp::fill(dudt, 0); - dr::mp::fill(dvdt, 0); - dr::mp::halo(dedt).exchange(); - dr::mp::halo(dudt).exchange(); - dr::mp::halo(dvdt).exchange(); - - auto init_op = [xmin, ymin, grid](auto index, auto v) { - auto &[o] = v; - - std::size_t global_i = index[0]; - if (global_i > 0) { - std::size_t global_j = index[1]; - T x = xmin + grid.dx / 2 + (global_i - 1) * grid.dx; - T y = ymin + grid.dy / 2 + global_j * grid.dy; - o = initial_elev(x, y, grid.lx, grid.ly); - } - }; - dr::mp::for_each(init_op, e); - - auto add = [](auto ops) { return ops.first + ops.second; }; - auto max = [](double x, double y) { return std::max(x, y); }; - auto rk_update2 = [](auto ops) { - return 0.75 * std::get<0>(ops) + - 0.25 * (std::get<1>(ops) + std::get<2>(ops)); - }; - auto rk_update3 = [](auto ops) { - return 1.0 / 3.0 * std::get<0>(ops) + - 2.0 / 3.0 * (std::get<1>(ops) + std::get<2>(ops)); - }; - - std::size_t i_export = 0; - double next_t_export = 0.0; - double t = 0.0; - double initial_v = 0.0; - auto tic = std::chrono::steady_clock::now(); -#ifdef DEBUG - nt = 5; -#endif - for (std::size_t i = 0; i < nt + 1; i++) { - t = i * dt; - - if (t >= next_t_export - 1e-8) { - - double elev_max = dr::mp::reduce(e, static_cast(0), max); - double u_max = dr::mp::reduce(u, static_cast(0), max); - - double total_v = (dr::mp::reduce(e, static_cast(0), std::plus{}) + h) * - grid.dx * grid.dy; - if (i == 0) { - initial_v = total_v; - } - double diff_v = total_v - initial_v; - - if (comm_rank == 0) { - printf("%2lu %4lu %.3f ", i_export, i, t); - printf("elev=%7.5f ", elev_max); - printf("u=%7.5f ", u_max); - printf("dV=% 6.3e ", diff_v); - printf("\n"); - } - if (elev_max > 1e3) { - if (comm_rank == 0) { - std::cout << "Invalid elevation value: " << elev_max << std::endl; - } - return 1; - } - i_export += 1; - next_t_export = i_export * t_export; - } - - // step - iter_callback(); - if (fused_kernels) { - stage1(u, v, e, u1, v1, e1, g, h, grid.dx_inv, grid.dy_inv, dt); - stage2(u, v, e, u1, v1, e1, u2, v2, e2, g, h, grid.dx_inv, grid.dy_inv, - dt); - stage3(u, v, e, u2, v2, e2, g, h, grid.dx_inv, grid.dy_inv, dt); - } else { - if (i % 2 == 1) { -#ifdef DEBUG - std::cout << "no communication\n"; -#endif - // First phase without communication - - // RK stage 1: u1 = u + dt*rhs(u) - rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); - dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); - dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); - - // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) - rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); - - // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) - rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); - } else { -#ifdef DEBUG - std::cout << "exchanges\n"; -#endif - // Second phase with communication - dr::mp::halo(e).exchange(); - dr::mp::halo(u).exchange(); - dr::mp::halo(v).exchange(); - - // RK stage 1: u1 = u + dt*rhs(u) - rhs(u, v, e, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); - dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); - dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); - - dr::mp::halo(u1).exchange(); - dr::mp::halo(v1).exchange(); - dr::mp::halo(e1).exchange(); - - // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) - rhs(u1, v1, e1, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, u1, dudt),u2.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(v, v1, dvdt),v2.begin(), rk_update2); - dr::mp::transform(dr::mp::views::zip(e, e1, dedt),e2.begin(), rk_update2); - - dr::mp::halo(u2).exchange(); - dr::mp::halo(v2).exchange(); - dr::mp::halo(e2).exchange(); - - // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) - rhs(u2, v2, e2, dudt, dvdt, dedt, g, h, grid.dx_inv, grid.dy_inv, dt); - dr::mp::transform(dr::mp::views::zip(u, u2, dudt),u.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(v, v2, dvdt),v.begin(), rk_update3); - dr::mp::transform(dr::mp::views::zip(e, e2, dedt),e.begin(), rk_update3); - } -#ifdef DEBUG - std::cout << "Iter " << i << "\n"; - debug_print_arr(nx + 1, ny, e, "e"); - debug_print_arr(nx + 1, ny, u, "u"); - debug_print_arr(nx + 1, ny + 1, v, "v"); - debug_print_arr(nx + 1, ny, e1, "e1"); - debug_print_arr(nx + 1, ny, u1, "u1"); - debug_print_arr(nx + 1, ny + 1, v1, "v1"); - debug_print_arr(nx + 1, ny, e2, "e2"); - debug_print_arr(nx + 1, ny, u2, "u2"); - debug_print_arr(nx + 1, ny + 1, v2, "v2"); - debug_print_arr(nx + 1, ny, dedt, "dedt"); - debug_print_arr(nx + 1, ny, dudt, "dudt"); - debug_print_arr(nx + 1, ny + 1, dvdt, "dvdt"); -#endif - } - } - - dr::mp::halo(e).exchange(); - dr::mp::halo(u).exchange(); - dr::mp::halo(v).exchange(); - dr::mp::halo(u1).exchange(); - dr::mp::halo(v1).exchange(); - dr::mp::halo(e1).exchange(); - dr::mp::halo(u2).exchange(); - dr::mp::halo(v2).exchange(); - dr::mp::halo(e2).exchange(); - - - auto toc = std::chrono::steady_clock::now(); - std::chrono::duration duration = toc - tic; - if (comm_rank == 0) { - double t_cpu = duration.count(); - double t_step = t_cpu / nt; - double read_bw = double(nread) / t_step / (1024 * 1024 * 1024); - double write_bw = double(nwrite) / t_step / (1024 * 1024 * 1024); - double flop_rate = double(nflop) / t_step / (1000 * 1000 * 1000); - double ai = double(nflop) / double(nread + nwrite); - std::cout << "Duration: " << std::setprecision(3) << t_cpu; - std::cout << " s" << std::endl; - std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000; - std::cout << " ms" << std::endl; - std::cout << "Reads : " << std::setprecision(3) << read_bw; - std::cout << " GB/s" << std::endl; - std::cout << "Writes: " << std::setprecision(3) << write_bw; - std::cout << " GB/s" << std::endl; - std::cout << "FLOP/s: " << std::setprecision(3) << flop_rate; - std::cout << " GFLOP/s" << std::endl; - std::cout << "Arithmetic intensity: " << std::setprecision(5) << ai; - std::cout << " FLOP/Byte" << std::endl; - } - - // Compute error against exact solution - Array e_exact({nx + 1, ny}, dist); - dr::mp::fill(e_exact, 0.0); - Array error({nx + 1, ny}, dist); - - auto exact_op = [xmin, ymin, grid, t](auto index, auto v) { - auto &[o] = v; - - std::size_t global_i = index[0]; - if (global_i > 0) { - std::size_t global_j = index[1]; - T x = xmin + grid.dx / 2 + (global_i - 1) * grid.dx; - T y = ymin + grid.dy / 2 + global_j * grid.dy; - o = exact_elev(x, y, t, grid.lx, grid.ly); - } - }; - dr::mp::for_each(exact_op, e_exact); - dr::mp::halo(e_exact).exchange(); - auto error_kernel = [](auto ops) { - auto err = ops.first - ops.second; - return err * err; - }; - dr::mp::transform(dr::mp::views::zip(e, e_exact), error.begin(), - error_kernel); - double err_L2 = dr::mp::reduce(error, static_cast(0), std::plus{}) * - grid.dx * grid.dy / grid.lx / grid.ly; - err_L2 = std::sqrt(err_L2); - if (comm_rank == 0) { - std::cout << "L2 error: " << std::setw(7) << std::scientific; - std::cout << std::setprecision(5) << err_L2 << std::endl; - } - - if (benchmark_mode) { - return 0; - } - if (nx < 128 || ny < 128) { - if (comm_rank == 0) { - std::cout << "Skipping correctness test due to small problem size." - << std::endl; - } - } else if (nx == 128 && ny == 128) { - double expected_L2 = 0.007224068445111; - double rel_tolerance = 1e-6; - double rel_err = err_L2 / expected_L2 - 1.0; - if (!(fabs(rel_err) < rel_tolerance)) { - if (comm_rank == 0) { - std::cout << "ERROR: L2 error deviates from reference value: " - << expected_L2 << ", relative error: " << rel_err - << std::endl; - } - return 1; - } - } else { - double tolerance = 1e-2; - if (!(err_L2 < tolerance)) { - if (comm_rank == 0) { - std::cout << "ERROR: L2 error exceeds tolerance: " << err_L2 << " > " - << tolerance << std::endl; - } - return 1; - } - } - if (comm_rank == 0) { - std::cout << "SUCCESS" << std::endl; - } - - return 0; -} - -} // namespace WaveEquation - -#ifdef STANDALONE_BENCHMARK - -int main(int argc, char *argv[]) { - - MPI_Init(&argc, &argv); - comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); - - cxxopts::Options options_spec(argv[0], "wave equation"); - // clang-format off - options_spec.add_options() -#ifndef DEBUG - ("n", "Grid size", cxxopts::value()->default_value("128")) -#else - ("n", "Grid size", cxxopts::value()->default_value("16")) -#endif - ("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value()->default_value("false")) - ("sycl", "Execute on SYCL device") - ("l,log", "enable logging") - ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) - ("f,fused-kernel", "Use fused kernels.", cxxopts::value()->default_value("false")) - ("device-memory", "Use device memory") - ("h,help", "Print help"); - // clang-format on - - cxxopts::ParseResult options; - try { - options = options_spec.parse(argc, argv); - } catch (const cxxopts::OptionParseException &e) { - std::cout << options_spec.help() << "\n"; - exit(1); - } - - std::unique_ptr logfile; - if (options.count("log")) { - logfile.reset(new std::ofstream(options["logprefix"].as() + - fmt::format(".{}.log", comm_rank))); - dr::drlog.set_file(*logfile); - } - - if (options.count("sycl")) { -#ifdef SYCL_LANGUAGE_VERSION - sycl::queue q = dr::mp::select_queue(); - std::cout << "Run on: " - << q.get_device().get_info() << "\n"; - dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device - : sycl::usm::alloc::shared); -#else - std::cout << "Sycl support requires icpx\n"; - exit(1); -#endif - } else { - if (comm_rank == 0) { - std::cout << "Run on: CPU\n"; - } - dr::mp::init(); - } - - std::size_t n = options["n"].as(); - bool benchmark_mode = options["t"].as(); - bool fused_kernels = options["f"].as(); - - auto error = WaveEquation::run(n, benchmark_mode, fused_kernels); - dr::mp::finalize(); - MPI_Finalize(); - return error; -} - -#else - -static void WaveEquation_DR(benchmark::State &state) { - - int n = ::sqrtl(default_vector_size); - - // ugly hack to make it working in reasonable time in benchmarking framework - // drbench.py should specify right size or there should be another size option - // to use here instead of default_vector_size - n /= 4; - - std::size_t nread, nwrite, nflop; - WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop); - Stats stats(state, nread, nwrite, nflop); - - auto iter_callback = [&stats]() { stats.rep(); }; - for (auto _ : state) { - WaveEquation::run(n, true, true, iter_callback); - } -} - -DR_BENCHMARK(WaveEquation_DR); - -#endif diff --git a/benchmarks/gbench/mp/wave_equation.cpp b/benchmarks/gbench/mp/wave_equation.cpp index 71ac12bba9..ee0a5b9799 100644 --- a/benchmarks/gbench/mp/wave_equation.cpp +++ b/benchmarks/gbench/mp/wave_equation.cpp @@ -71,8 +71,6 @@ double initial_elev(double x, double y, double lx, double ly) { return exact_elev(x, y, 0.0, lx, ly); } -//#define DEBUG - void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, double g, double h, double dx_inv, double dy_inv, double dt) { /** @@ -84,9 +82,6 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; }; { -#ifdef DEBUG - std::cout << "stage1\n"; -#endif std::array start{1, 0}; std::array end{e.extent(0) - 1, e.extent(1)}; auto e_view = dr::mp::views::submdspan(e.view(), start, end); @@ -99,9 +94,6 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; }; { -#ifdef DEBUG - std::cout << "stage2\n"; -#endif std::array start{0, 1}; std::array end{e.extent(0), e.extent(1)}; auto e_view = dr::mp::views::submdspan(e.view(), start, end); @@ -118,9 +110,6 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, out(0, 0) = -dt * h * (dudx + dvdy); }; { -#ifdef DEBUG - std::cout << "stage3\n"; -#endif std::array start{1, 0}; std::array end{u.extent(0), u.extent(1)}; auto u_view = dr::mp::views::submdspan(u.view(), start, end); @@ -128,9 +117,6 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, auto dedt_view = dr::mp::views::submdspan(dedt.view(), start, end); dr::mp::stencil_for_each(rhs_div, u_view, v_view, dedt_view); } -#ifdef DEBUG - std::cout << "after\n"; -#endif }; void stage1(Array &u, Array &v, Array &e, Array &u1, Array &v1, Array &e1, @@ -327,13 +313,6 @@ void stage3(Array &u, Array &v, Array &e, Array &u2, Array &v2, Array &e2, dr::mp::halo(e).exchange_begin(); }; -#ifdef DEBUG - void debug_print_arr(std::size_t n, std::size_t m, const Array& arr, const std::string& str) { - std::cout << "Array " << str << ":\n"; - std::cout << arr << "\n"; -} -#endif - int run( int n, bool benchmark_mode, bool fused_kernels, std::function iter_callback = []() {}) { @@ -452,9 +431,6 @@ int run( double t = 0.0; double initial_v = 0.0; auto tic = std::chrono::steady_clock::now(); -#ifdef DEBUG - nt = 5; -#endif for (std::size_t i = 0; i < nt + 1; i++) { t = i * dt; @@ -524,21 +500,6 @@ int run( dr::mp::halo(v).exchange_begin(); dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); dr::mp::halo(e).exchange_begin(); -#ifdef DEBUG - std::cout << "Iter " << i << "\n"; - debug_print_arr(nx + 1, ny, e, "e"); - debug_print_arr(nx + 1, ny, u, "u"); - debug_print_arr(nx + 1, ny + 1, v, "v"); - debug_print_arr(nx + 1, ny, e1, "e1"); - debug_print_arr(nx + 1, ny, u1, "u1"); - debug_print_arr(nx + 1, ny + 1, v1, "v1"); - debug_print_arr(nx + 1, ny, e2, "e2"); - debug_print_arr(nx + 1, ny, u2, "u2"); - debug_print_arr(nx + 1, ny + 1, v2, "v2"); - debug_print_arr(nx + 1, ny, dedt, "dedt"); - debug_print_arr(nx + 1, ny, dudt, "dudt"); - debug_print_arr(nx + 1, ny + 1, dvdt, "dvdt"); -#endif } } dr::mp::halo(u).exchange_finalize(); @@ -650,11 +611,7 @@ int main(int argc, char *argv[]) { cxxopts::Options options_spec(argv[0], "wave equation"); // clang-format off options_spec.add_options() -#ifndef DEBUG ("n", "Grid size", cxxopts::value()->default_value("128")) -#else - ("n", "Grid size", cxxopts::value()->default_value("16")) -#endif ("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value()->default_value("false")) ("sycl", "Execute on SYCL device") ("l,log", "enable logging") diff --git a/benchmarks/gbench/mp/wave_equation_wide.cpp b/benchmarks/gbench/mp/wave_equation_wide.cpp new file mode 100644 index 0000000000..e411e4fcc8 --- /dev/null +++ b/benchmarks/gbench/mp/wave_equation_wide.cpp @@ -0,0 +1,442 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cxxopts.hpp" +#include "dr/mp.hpp" +#include "mpi.h" +#include "wave_utils.hpp" +#include +#include +#include + +#ifdef STANDALONE_BENCHMARK + +MPI_Comm comm; +int comm_rank; +int comm_size; + +#else + +#include "../common/dr_bench.hpp" + +#endif + +namespace WaveEquation { + +using T = double; +using Array = dr::mp::distributed_mdarray; + +// gravitational acceleration +constexpr double g = 9.81; +// water depth +constexpr double h = 1.0; + +// Get number of read/write bytes and flops for a single time step +// These numbers correspond to the fused kernel version +void calculate_complexity(std::size_t nx, std::size_t ny, std::size_t &nread, + std::size_t &nwrite, std::size_t &nflop) { + // stage1: 2+2+3 = 7 + // stage2: 3+3+4 = 10 + // stage3: 3+3+4 = 10 + nread = (27 * nx * ny) * sizeof(T); + // stage1: 3 + // stage2: 3 + // stage3: 3 + nwrite = (9 * nx * ny) * sizeof(T); + // stage1: 3+3+6 = 12 + // stage2: 6+6+9 = 21 + // stage3: 6+6+9 = 21 + nflop = 54 * nx * ny; +} + +double exact_elev(double x, double y, double t, double lx, double ly) { + /** + * Exact solution for elevation field. + * + * Returns time-dependent elevation of a 2D standing wave in a + * rectangular domain. + */ + double amp = 0.5; + double c = std::sqrt(g * h); + double sol_x = std::cos(2.0 * M_PI * x / lx); + double sol_y = std::cos(2.0 * M_PI * y / ly); + double omega = c * M_PI * std::hypot(1.0 / lx, 1.0 / ly); + double sol_t = std::cos(2.0 * omega * t); + return amp * sol_x * sol_y * sol_t; +} + +double initial_elev(double x, double y, double lx, double ly) { + return exact_elev(x, y, 0.0, lx, ly); +} + +void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, double dx_inv, double dy_inv, double dt) { + /** + * Evaluate right hand side of the equations + */ + auto rhs_dedx = [dt, dx_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(1, 0) - in(0, 0)) * dx_inv; + }; + stencil_for_each_extended<2>(rhs_dedx, {0, 0}, {1, 0}, e, dudt); + + auto rhs_dedy = [dt, dy_inv](auto v) { + auto [in, out] = v; + out(0, 0) = -dt * g * (in(0, 0) - in(0, -1)) * dy_inv; + }; + stencil_for_each_extended<2>(rhs_dedy, {0, 1}, {0, 0}, e, dvdt); + + auto rhs_div = [dt, dx_inv, dy_inv](auto args) { + auto [u, v, out] = args; + auto dudx = (u(0, 0) - u(-1, 0)) * dx_inv; + auto dvdy = (v(0, 1) - v(0, 0)) * dy_inv; + out(0, 0) = -dt * h * (dudx + dvdy); + }; + stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt); +} + +int run(std::size_t n, std::size_t redundancy, std::size_t steps, std::function iter_callback = []() {}) { + // construct grid + // number of cells in x, y direction + std::size_t nx = n; + std::size_t ny = n; + const double xmin = -1, xmax = 1; + const double ymin = -1, ymax = 1; + ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); + + auto dist = dr::mp::distribution() + .halo(1) + .redundancy(redundancy); + + // statistics + std::size_t nread, nwrite, nflop; + calculate_complexity(nx, ny, nread, nwrite, nflop); + + if (comm_rank == 0) { + std::cout << "Using backend: dr" << std::endl; + std::cout << "Grid size: " << nx << " x " << ny << std::endl; + std::cout << "Elevation DOFs: " << nx * ny << std::endl; + std::cout << "Velocity DOFs: " << (nx + 1) * ny + nx * (ny + 1) + << std::endl; + std::cout << "Total DOFs: " << nx * ny + (nx + 1) * ny + nx * (ny + 1); + std::cout << std::endl; + } + + // compute time step + double c = std::sqrt(g * h); + double alpha = 0.5; + double dt = alpha * std::min(grid.dx, grid.dy) / c; + std::size_t nt = steps; + dt = 1e-5; + double t_end = static_cast(nt) * dt; + double t_export = 25 * dt; + + if (comm_rank == 0) { + std::cout << "Time step: " << dt << " s" << std::endl; + std::cout << "Total run time: " << std::fixed << std::setprecision(1); + std::cout << t_end << " s, "; + std::cout << nt << " time steps" << std::endl; + std::cout << "Redundancy " << redundancy << std::endl; + } + + // state variables + // water elevation at T points + Array e({nx + 1, ny}, dist); + dr::mp::fill(e, 0.0); + // x velocity at U points + Array u({nx + 1, ny}, dist); + dr::mp::fill(u, 0.0); + // y velocity at V points + Array v({nx + 1, ny + 1}, dist); + dr::mp::fill(v, 0.0); + + // state for RK stages + Array e1({nx + 1, ny}, dist); + Array u1({nx + 1, ny}, dist); + Array v1({nx + 1, ny + 1}, dist); + Array e2({nx + 1, ny}, dist); + Array u2({nx + 1, ny}, dist); + Array v2({nx + 1, ny + 1}, dist); + + // time tendencies + // NOTE not needed if rhs kernels are fused with RK stage assignment + Array dedt({nx + 1, ny}, dist); + Array dudt({nx + 1, ny}, dist); + Array dvdt({nx + 1, ny + 1}, dist); + + dr::mp::fill(dedt, 0); + dr::mp::fill(dudt, 0); + dr::mp::fill(dvdt, 0); + dr::mp::halo(dedt).exchange(); + dr::mp::halo(dudt).exchange(); + dr::mp::halo(dvdt).exchange(); + + auto init_op = [xmin, ymin, grid](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + static_cast(global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + static_cast(global_j) * grid.dy; + o = initial_elev(x, y, grid.lx, grid.ly); + } + }; + dr::mp::for_each(init_op, e); + + auto add = [](auto ops) { return ops.first + ops.second; }; + auto max = [](double x, double y) { return std::max(x, y); }; + auto rk_update2 = [](auto ops) { + return 0.75 * std::get<0>(ops) + + 0.25 * (std::get<1>(ops) + std::get<2>(ops)); + }; + auto rk_update3 = [](auto ops) { + return 1.0 / 3.0 * std::get<0>(ops) + + 2.0 / 3.0 * (std::get<1>(ops) + std::get<2>(ops)); + }; + + std::size_t i_export = 0; + double next_t_export = 0.0; + double t = 0.0; + double initial_v = 0.0; + auto tic = std::chrono::steady_clock::now(); + + // RK stage 1: u1 = u + dt*rhs(u) + auto stage_1 = [&] { + rhs(u, v, e, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, dudt), u1.begin(), add); + dr::mp::transform(dr::mp::views::zip(v, dvdt), v1.begin(), add); + dr::mp::transform(dr::mp::views::zip(e, dedt), e1.begin(), add); + }; + // RK stage 2: u2 = 0.75*u + 0.25*(u1 + dt*rhs(u1)) + auto stage_2 = [&] { + rhs(u1, v1, e1, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u1, dudt), u2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(v, v1, dvdt), v2.begin(), rk_update2); + dr::mp::transform(dr::mp::views::zip(e, e1, dedt), e2.begin(), rk_update2); + }; + // RK stage 3: u3 = 1/3*u + 2/3*(u2 + dt*rhs(u2)) + auto stage_3 = [&] { + rhs(u2, v2, e2, dudt, dvdt, dedt, grid.dx_inv, grid.dy_inv, dt); + dr::mp::transform(dr::mp::views::zip(u, u2, dudt), u.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(v, v2, dvdt), v.begin(), rk_update3); + dr::mp::transform(dr::mp::views::zip(e, e2, dedt), e.begin(), rk_update3); + }; + + for (std::size_t i = 0; i < nt + 1; i++) { + t = static_cast(i) * dt; + + if (t >= next_t_export - 1e-8) { + + double elev_max = dr::mp::reduce(e, static_cast(0), max); + double u_max = dr::mp::reduce(u, static_cast(0), max); + + double total_v = (dr::mp::reduce(e, static_cast(0), std::plus{}) + h) * + grid.dx * grid.dy; + if (i == 0) { + initial_v = total_v; + } + double diff_v = total_v - initial_v; + + if (comm_rank == 0) { + printf("%2lu %4lu %.3f ", i_export, i, t); + printf("elev=%7.5f ", elev_max); + printf("u=%7.5f ", u_max); + printf("dV=% 6.3e ", diff_v); + printf("\n"); + } + if (elev_max > 1e3) { + if (comm_rank == 0) { + std::cout << "Invalid elevation value: " << elev_max << std::endl; + } + return 1; + } + i_export += 1; + next_t_export = static_cast(i_export) * t_export; + } + + // step + iter_callback(); + if ((i + 1) % redundancy == 0){ + // phase with communication - once after (redundancy - 1) steps without communication + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + stage_1(); + + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); + stage_2(); + + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + stage_3(); + } else { + // Phase without communication + stage_1(); + stage_2(); + stage_3(); + } + } + + dr::mp::halo(e).exchange(); + dr::mp::halo(u).exchange(); + dr::mp::halo(v).exchange(); + dr::mp::halo(u1).exchange(); + dr::mp::halo(v1).exchange(); + dr::mp::halo(e1).exchange(); + dr::mp::halo(u2).exchange(); + dr::mp::halo(v2).exchange(); + dr::mp::halo(e2).exchange(); + + + auto toc = std::chrono::steady_clock::now(); + std::chrono::duration duration = toc - tic; + if (comm_rank == 0) { + double t_cpu = duration.count(); + double t_step = t_cpu / static_cast(nt); + double read_bw = double(nread) / t_step / (1024 * 1024 * 1024); + double write_bw = double(nwrite) / t_step / (1024 * 1024 * 1024); + double flop_rate = double(nflop) / t_step / (1000 * 1000 * 1000); + double ai = double(nflop) / double(nread + nwrite); + std::cout << "Duration: " << std::setprecision(3) << t_cpu; + std::cout << " s" << std::endl; + std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000; + std::cout << " ms" << std::endl; + std::cout << "Reads : " << std::setprecision(3) << read_bw; + std::cout << " GB/s" << std::endl; + std::cout << "Writes: " << std::setprecision(3) << write_bw; + std::cout << " GB/s" << std::endl; + std::cout << "FLOP/s: " << std::setprecision(3) << flop_rate; + std::cout << " GFLOP/s" << std::endl; + std::cout << "Arithmetic intensity: " << std::setprecision(5) << ai; + std::cout << " FLOP/Byte" << std::endl; + } + + // Compute error against exact solution + Array e_exact({nx + 1, ny}, dist); + dr::mp::fill(e_exact, 0.0); + Array error({nx + 1, ny}, dist); + + auto exact_op = [xmin, ymin, grid, t](auto index, auto v) { + auto &[o] = v; + + std::size_t global_i = index[0]; + if (global_i > 0) { + std::size_t global_j = index[1]; + T x = xmin + grid.dx / 2 + static_cast(global_i - 1) * grid.dx; + T y = ymin + grid.dy / 2 + static_cast(global_j) * grid.dy; + o = exact_elev(x, y, t, grid.lx, grid.ly); + } + }; + dr::mp::for_each(exact_op, e_exact); + dr::mp::halo(e_exact).exchange(); + auto error_kernel = [](auto ops) { + auto err = ops.first - ops.second; + return err * err; + }; + dr::mp::transform(dr::mp::views::zip(e, e_exact), error.begin(), + error_kernel); + double err_L2 = dr::mp::reduce(error, static_cast(0), std::plus{}) * + grid.dx * grid.dy / grid.lx / grid.ly; + err_L2 = std::sqrt(err_L2); + if (comm_rank == 0) { + std::cout << "L2 error: " << std::setw(7) << std::scientific; + std::cout << std::setprecision(5) << err_L2 << std::endl; + } + return 0; +} + +} // namespace WaveEquation + +#ifdef STANDALONE_BENCHMARK + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + cxxopts::Options options_spec(argv[0], "wave equation"); + // clang-format off + options_spec.add_options() + ("n", "Grid size", cxxopts::value()->default_value("128")) + ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) + ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")) + ("sycl", "Execute on SYCL device") + ("l,log", "enable logging") + ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("device-memory", "Use device memory") + ("h,help", "Print help"); + // clang-format on + + cxxopts::ParseResult options; + try { + options = options_spec.parse(argc, argv); + } catch (const cxxopts::OptionParseException &e) { + std::cout << options_spec.help() << "\n"; + exit(1); + } + + std::unique_ptr logfile; + if (options.count("log")) { + logfile = std::make_unique(options["logprefix"].as() + + fmt::format(".{}.log", comm_rank)); + dr::drlog.set_file(*logfile); + } + + if (options.count("sycl")) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + std::cout << "Run on: " + << q.get_device().get_info() << "\n"; + dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); +#else + std::cout << "Sycl support requires icpx\n"; + exit(1); +#endif + } else { + if (comm_rank == 0) { + std::cout << "Run on: CPU\n"; + } + dr::mp::init(); + } + + std::size_t n = options["n"].as(); + std::size_t redundancy = options["r"].as(); + std::size_t steps = options["t"].as(); + + auto error = WaveEquation::run(n, redundancy, steps); + dr::mp::finalize(); + MPI_Finalize(); + return error; +} + +#else + +static void WaveEquation_DR(benchmark::State &state) { + + int n = ::sqrtl(default_vector_size); + + // ugly hack to make it working in reasonable time in benchmarking framework + // drbench.py should specify right size or there should be another size option + // to use here instead of default_vector_size + n /= 4; + + std::size_t nread, nwrite, nflop; + WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop); + Stats stats(state, nread, nwrite, nflop); + + auto iter_callback = [&stats]() { stats.rep(); }; + for (auto _ : state) { + WaveEquation::run(n, true, true, iter_callback); + } +} + +DR_BENCHMARK(WaveEquation_DR); + +#endif diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 62f54b88d6..bc4b981385 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -95,8 +95,6 @@ namespace __detail { op(stencils); }; if (mp::use_sycl()) { - dr::drlog.debug(" using sycl\n"); - #ifdef SYCL_LANGUAGE_VERSION dr::__detail::parallel_for( dr::mp::sycl_queue(), sycl::range<1>(distance[0]), @@ -106,7 +104,6 @@ namespace __detail { assert(false); #endif } else { - dr::drlog.debug(" using cpu\n"); for (std::size_t i = 0; i < distance[0]; i++) { do_point(i); } @@ -148,38 +145,28 @@ namespace __detail { }); op(stencils); }; -// if (mp::use_sycl()) { -// -//#ifdef SYCL_LANGUAGE_VERSION -// dr::__detail::parallel_for( -// dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), -// do_point) -// .wait(); -//#else -// assert(false); -//#endif -// } else { + if (mp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), + do_point) + .wait(); +#else + assert(false); +#endif + } else { for (std::size_t i = 0; i < distance[0]; i++) { for (std::size_t j = 0; j < distance[1]; j++) { - auto seg0 = std::get<0>(segs); - auto origin0 = seg0.origin(); - auto begin0 = seg0_begin; - std::cout << origin0[0] + i + begin0[0] << " " << origin0[1] + j + begin0[1] << "\n"; -// auto seg1 = std::get<1>(segs); -// auto origin1 = seg1.origin(); -// auto begin1 = seg1.begin_stencil(begin); -// std::cout << "snd " << origin1[0] + i + begin1[0] << " " << origin1[1] + j + begin1[1] << "\n"; do_point(stencil_index_type<2>{i, j}); } } -// } + } } } template requires (1 <= Rank && Rank <= 3) void stencil_for_each_extended(auto op, __detail::stencil_index_type begin, __detail::stencil_index_type end, dr::distributed_range auto &&...drs) { - dr::drlog.debug(dr::logger::for_each, "for_each_extended: parallel execution\n"); auto ranges = std::tie(drs...); auto &&dr0 = std::get<0>(ranges); if (rng::empty(dr0)) { @@ -194,7 +181,12 @@ void stencil_for_each_extended(auto op, __detail::stencil_index_type begin else if constexpr (Rank == 2) { __detail::stencil_for_each_extended_2(op, begin, end, segs); } - else if constexpr (Rank == 3) {} + else if constexpr (Rank == 3) { + static_assert(false, "Not implemented"); + } + else { + static_assert(false, "Not supported"); // sycl for_each does not support more than 3 dimensions + } } barrier(); } diff --git a/include/dr/mp/algorithms/md_for_each.hpp b/include/dr/mp/algorithms/md_for_each.hpp index 5e145f91c7..9d92f0fe98 100644 --- a/include/dr/mp/algorithms/md_for_each.hpp +++ b/include/dr/mp/algorithms/md_for_each.hpp @@ -103,7 +103,6 @@ void stencil_for_each(auto op, is_mdspan_view auto &&...drs) { #else for (std::size_t i = 0; i < mdspan0.extents().extent(0); i++) { for (std::size_t j = 0; j < mdspan0.extents().extent(1); j++) { -// std::cout << seg0.origin()[0] + i << " " << seg0.origin()[1] + j << "\n"; invoke_index(std::array{i, j}); } } diff --git a/include/dr/mp/views/mdspan_view.hpp b/include/dr/mp/views/mdspan_view.hpp index 568d9b6c5f..3ffd58dc71 100644 --- a/include/dr/mp/views/mdspan_view.hpp +++ b/include/dr/mp/views/mdspan_view.hpp @@ -90,7 +90,6 @@ class md_segment : public rng::view_interface> { } static auto local_tile_extended(BaseSegment segment, const index_type &tile_shape) { - // Undefined behavior if the segments is not local T *ptr = std::to_address(dr::ranges::local(rng::begin(segment))); return md::mdspan(ptr, tile_shape); } diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp index 99ac5304ca..e5d7b90759 100644 --- a/test/gtest/mp/wide-halo-2d-3.cpp +++ b/test/gtest/mp/wide-halo-2d-3.cpp @@ -18,13 +18,9 @@ dr::mp::distribution get_distribution() { .redundancy(redundancy); } -//int& get(Array& v, std::size_t i, std::size_t j) { -// return *(v.begin() + i * 6 + j).local(); -//} -// -//const int& get(const Array& v, std::size_t i, std::size_t j) { -// return *(v.begin() + i * 6 + j).local(); -//} +int& get(Array& v, std::size_t i, std::size_t j) { + return *(v.begin() + i * size[0] + j).local(); +} TEST(WideHalo3, suite_works_for_3_processes_only) { EXPECT_EQ(dr::mp::default_comm().size(), 3); @@ -70,13 +66,113 @@ TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { }; print("dv", dv); + transform(); print("dv", dv); + // after first step, only actually stored values and their neighbours are guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; + } + transform(); print("dv", dv); + // after second step, only actually stored values are guaranteed to be correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 41); + EXPECT_EQ(get(dv, 3, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 57); + EXPECT_EQ(get(dv, 3, 2), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 41); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 1); + EXPECT_EQ(get(dv, 3, 2), 57); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } + dv.halo().exchange(); dv_out.halo().exchange(); print("dv", dv); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } } TEST(WideHalo3, halo2d_api_works) { @@ -122,4 +218,41 @@ TEST(WideHalo3, halo2d_api_works) { }, dv, dv_out); print("dv", dv); + // after exchange all are correct + switch (dr::mp::default_comm().rank()) { + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + } } From f269ac8810e40ad8ea3fcd43921210ece6c883c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Wed, 23 Oct 2024 18:23:02 +0200 Subject: [PATCH 06/19] Add support for 3d arrays --- include/dr/mp/algorithms/for_each.hpp | 57 ++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index bc4b981385..ef624e7c3e 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -162,6 +162,61 @@ namespace __detail { } } } + + void stencil_for_each_extended_3(auto op, stencil_index_type<3>& begin, stencil_index_type<3> end, const auto& segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if ((distance | std::views::filter(is_zero)).empty()) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair( + md::mdspan( + std::to_address(&seg.mdspan_extended()(begin_stencil[0], begin_stencil[1], begin_stencil[2])), + ext + ), ext); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(&seg_info.first(index[0], index[1], index[2])), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { +#ifdef SYCL_LANGUAGE_VERSION + dr::__detail::parallel_for( + dr::mp::sycl_queue(), sycl::range<3>(distance[0], distance[1], distance[2]), + do_point) + .wait(); +#else + assert(false); +#endif + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t j = 0; j < distance[1]; j++) { + for (std::size_t k = 0; k < distance[3]; k++) { + do_point(stencil_index_type<3>{i, j, k}); + } + } + } + } + } } template @@ -182,7 +237,7 @@ void stencil_for_each_extended(auto op, __detail::stencil_index_type begin __detail::stencil_for_each_extended_2(op, begin, end, segs); } else if constexpr (Rank == 3) { - static_assert(false, "Not implemented"); + __detail::stencil_for_each_extended_3(op, begin, end, segs); } else { static_assert(false, "Not supported"); // sycl for_each does not support more than 3 dimensions From 3c9ffb32d1fee8bd5c55ad7213f304ccb4b13e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Tue, 5 Nov 2024 20:00:07 +0100 Subject: [PATCH 07/19] Style issues --- include/dr/mp/algorithms/for_each.hpp | 6 +++--- include/dr/mp/halo.hpp | 8 ++++---- include/dr/mp/halo/instance.hpp | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index ef624e7c3e..2b1bafe1c7 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -77,7 +77,7 @@ namespace __detail { auto zipped = zip_view(seg0_begin, seg0_end); auto distance = zipped | std::views::transform(sub); - if ((distance | std::views::filter(is_zero)).empty()) + if (rng::empty(distance | std::views::filter(is_zero))) return; auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { @@ -123,7 +123,7 @@ namespace __detail { auto zipped = zip_view(seg0_begin, seg0_end); auto distance = zipped | std::views::transform(sub); - if ((distance | std::views::filter(is_zero)).empty()) + if (rng::empty(distance | std::views::filter(is_zero))) return; auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { @@ -176,7 +176,7 @@ namespace __detail { auto zipped = zip_view(seg0_begin, seg0_end); auto distance = zipped | std::views::transform(sub); - if ((distance | std::views::filter(is_zero)).empty()) + if (rng::empty(distance | std::views::filter(is_zero))) return; auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp index 53cabb853c..cdf884f169 100644 --- a/include/dr/mp/halo.hpp +++ b/include/dr/mp/halo.hpp @@ -4,7 +4,7 @@ #pragma once -#include "halo/halo.hpp" -#include "halo/group.hpp" -#include "halo/instance.hpp" -#include "halo/format.hpp" +#include +#include +#include +#include diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp index 5fdb4555a1..4541962135 100644 --- a/include/dr/mp/halo/instance.hpp +++ b/include/dr/mp/halo/instance.hpp @@ -6,8 +6,8 @@ #include #include -#include "halo.hpp" -#include "group.hpp" +#include +#include namespace dr::mp { template From 80fa24e08c1e441bf2af0b8f9077720ded0f39b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Wed, 6 Nov 2024 18:39:23 +0100 Subject: [PATCH 08/19] Clang-format --- benchmarks/gbench/mp/wave_equation_wide.cpp | 22 +- include/dr/mp/algorithms/for_each.hpp | 287 ++++++++-------- include/dr/mp/algorithms/md_for_each.hpp | 4 +- .../dr/mp/containers/distributed_mdarray.hpp | 6 +- .../dr/mp/containers/distributed_vector.hpp | 14 +- include/dr/mp/containers/distribution.hpp | 3 +- include/dr/mp/containers/segment.hpp | 24 +- include/dr/mp/halo.hpp | 4 +- include/dr/mp/halo/halo.hpp | 317 ++++++++--------- include/dr/mp/halo/instance.hpp | 52 +-- include/dr/mp/views/mdspan_view.hpp | 68 ++-- test/gtest/mp/halo-3.cpp | 108 +++--- test/gtest/mp/wide-halo-1d-3.cpp | 225 ++++++------ test/gtest/mp/wide-halo-2d-3.cpp | 325 +++++++++--------- 14 files changed, 753 insertions(+), 706 deletions(-) diff --git a/benchmarks/gbench/mp/wave_equation_wide.cpp b/benchmarks/gbench/mp/wave_equation_wide.cpp index e411e4fcc8..b8fb28e083 100644 --- a/benchmarks/gbench/mp/wave_equation_wide.cpp +++ b/benchmarks/gbench/mp/wave_equation_wide.cpp @@ -70,7 +70,8 @@ double initial_elev(double x, double y, double lx, double ly) { return exact_elev(x, y, 0.0, lx, ly); } -void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, double dx_inv, double dy_inv, double dt) { +void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, + double dx_inv, double dy_inv, double dt) { /** * Evaluate right hand side of the equations */ @@ -95,7 +96,9 @@ void rhs(Array &u, Array &v, Array &e, Array &dudt, Array &dvdt, Array &dedt, do stencil_for_each_extended<2>(rhs_div, {1, 0}, {0, 0}, u, v, dedt); } -int run(std::size_t n, std::size_t redundancy, std::size_t steps, std::function iter_callback = []() {}) { +int run( + std::size_t n, std::size_t redundancy, std::size_t steps, + std::function iter_callback = []() {}) { // construct grid // number of cells in x, y direction std::size_t nx = n; @@ -104,9 +107,7 @@ int run(std::size_t n, std::size_t redundancy, std::size_t steps, std::function< const double ymin = -1, ymax = 1; ArakawaCGrid grid(xmin, xmax, ymin, ymax, nx, ny); - auto dist = dr::mp::distribution() - .halo(1) - .redundancy(redundancy); + auto dist = dr::mp::distribution().halo(1).redundancy(redundancy); // statistics std::size_t nread, nwrite, nflop; @@ -257,8 +258,9 @@ int run(std::size_t n, std::size_t redundancy, std::size_t steps, std::function< // step iter_callback(); - if ((i + 1) % redundancy == 0){ - // phase with communication - once after (redundancy - 1) steps without communication + if ((i + 1) % redundancy == 0) { + // phase with communication - once after (redundancy - 1) steps without + // communication dr::mp::halo(e).exchange(); dr::mp::halo(u).exchange(); dr::mp::halo(v).exchange(); @@ -291,7 +293,6 @@ int run(std::size_t n, std::size_t redundancy, std::size_t steps, std::function< dr::mp::halo(v2).exchange(); dr::mp::halo(e2).exchange(); - auto toc = std::chrono::steady_clock::now(); std::chrono::duration duration = toc - tic; if (comm_rank == 0) { @@ -383,8 +384,9 @@ int main(int argc, char *argv[]) { std::unique_ptr logfile; if (options.count("log")) { - logfile = std::make_unique(options["logprefix"].as() + - fmt::format(".{}.log", comm_rank)); + logfile = + std::make_unique(options["logprefix"].as() + + fmt::format(".{}.log", comm_rank)); dr::drlog.set_file(*logfile); } diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 2b1bafe1c7..2b841be43b 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -6,17 +6,17 @@ #include #include +#include #include #include -#include #include #include #include #include #include -#include #include +#include namespace dr::mp { @@ -65,163 +65,166 @@ DI for_each_n(DI first, I n, auto op) { } namespace __detail { - template - using stencil_index_type = dr::__detail::dr_extents; - - void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, stencil_index_type<1> end, const auto& segs) { - auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); - - auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; - auto is_zero = [](auto a) { return a != 0; }; - - auto zipped = zip_view(seg0_begin, seg0_end); - auto distance = zipped | std::views::transform(sub); - - if (rng::empty(distance | std::views::filter(is_zero))) - return; - - auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { - return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], seg.extents()); - }); - - auto do_point = [seg_infos, op](auto index) { - auto stencils = - dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { - return md::mdspan( - std::to_address(dr::ranges::local(seg_info.first + index)), - seg_info.second - ); - }); - op(stencils); - }; - if (mp::use_sycl()) { +template +using stencil_index_type = dr::__detail::dr_extents; + +void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, + stencil_index_type<1> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { return std::get<1>(a) - std::get<0>(a); }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [begin](auto &&seg) { + return std::make_pair(seg.begin() + seg.begin_stencil(begin)[0], + seg.extents()); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(dr::ranges::local(seg_info.first + index)), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - dr::__detail::parallel_for( - dr::mp::sycl_queue(), sycl::range<1>(distance[0]), - do_point) - .wait(); + dr::__detail::parallel_for(dr::mp::sycl_queue(), + sycl::range<1>(distance[0]), do_point) + .wait(); #else - assert(false); + assert(false); #endif - } else { - for (std::size_t i = 0; i < distance[0]; i++) { - do_point(i); - } + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + do_point(i); } } +} - void stencil_for_each_extended_2(auto op, stencil_index_type<2>& begin, stencil_index_type<2> end, const auto& segs) { - auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); - - auto sub = [](auto a) { - auto x = std::get<0>(a); - auto y = std::get<1>(a); - return y > x ? y - x : 0; - }; - auto is_zero = [](auto a) { return a != 0; }; - - auto zipped = zip_view(seg0_begin, seg0_end); - auto distance = zipped | std::views::transform(sub); - - if (rng::empty(distance | std::views::filter(is_zero))) - return; - - auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { - auto ext = seg.root_mdspan().extents(); - auto begin_stencil = seg.begin_stencil(begin); - return std::make_pair( - md::mdspan( - std::to_address(&seg.mdspan_extended()(begin_stencil[0], begin_stencil[1])), - ext - ), ext); - }); - - auto do_point = [seg_infos, op](auto index) { - auto stencils = - dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { - return md::mdspan( - std::to_address(&seg_info.first(index[0], index[1])), - seg_info.second); - }); - op(stencils); - }; - if (mp::use_sycl()) { +void stencil_for_each_extended_2(auto op, stencil_index_type<2> &begin, + stencil_index_type<2> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair(md::mdspan(std::to_address(&seg.mdspan_extended()( + begin_stencil[0], begin_stencil[1])), + ext), + ext); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(&seg_info.first(index[0], index[1])), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - dr::__detail::parallel_for( - dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), - do_point) - .wait(); + dr::__detail::parallel_for(dr::mp::sycl_queue(), + sycl::range<2>(distance[0], distance[1]), + do_point) + .wait(); #else - assert(false); + assert(false); #endif - } else { - for (std::size_t i = 0; i < distance[0]; i++) { - for (std::size_t j = 0; j < distance[1]; j++) { - do_point(stencil_index_type<2>{i, j}); - } + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t j = 0; j < distance[1]; j++) { + do_point(stencil_index_type<2>{i, j}); } } } +} - void stencil_for_each_extended_3(auto op, stencil_index_type<3>& begin, stencil_index_type<3> end, const auto& segs) { - auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); - - auto sub = [](auto a) { - auto x = std::get<0>(a); - auto y = std::get<1>(a); - return y > x ? y - x : 0; - }; - auto is_zero = [](auto a) { return a != 0; }; - - auto zipped = zip_view(seg0_begin, seg0_end); - auto distance = zipped | std::views::transform(sub); - - if (rng::empty(distance | std::views::filter(is_zero))) - return; - - auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { - auto ext = seg.root_mdspan().extents(); - auto begin_stencil = seg.begin_stencil(begin); - return std::make_pair( - md::mdspan( - std::to_address(&seg.mdspan_extended()(begin_stencil[0], begin_stencil[1], begin_stencil[2])), - ext - ), ext); - }); - - auto do_point = [seg_infos, op](auto index) { - auto stencils = - dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { - return md::mdspan( - std::to_address(&seg_info.first(index[0], index[1], index[2])), - seg_info.second); - }); - op(stencils); - }; - if (mp::use_sycl()) { +void stencil_for_each_extended_3(auto op, stencil_index_type<3> &begin, + stencil_index_type<3> end, const auto &segs) { + auto [seg0_begin, seg0_end] = std::get<0>(segs).stencil(begin, end); + + auto sub = [](auto a) { + auto x = std::get<0>(a); + auto y = std::get<1>(a); + return y > x ? y - x : 0; + }; + auto is_zero = [](auto a) { return a != 0; }; + + auto zipped = zip_view(seg0_begin, seg0_end); + auto distance = zipped | std::views::transform(sub); + + if (rng::empty(distance | std::views::filter(is_zero))) + return; + + auto seg_infos = dr::__detail::tuple_transform(segs, [&begin](auto &&seg) { + auto ext = seg.root_mdspan().extents(); + auto begin_stencil = seg.begin_stencil(begin); + return std::make_pair( + md::mdspan(std::to_address(&seg.mdspan_extended()( + begin_stencil[0], begin_stencil[1], begin_stencil[2])), + ext), + ext); + }); + + auto do_point = [seg_infos, op](auto index) { + auto stencils = + dr::__detail::tuple_transform(seg_infos, [index](auto seg_info) { + return md::mdspan( + std::to_address(&seg_info.first(index[0], index[1], index[2])), + seg_info.second); + }); + op(stencils); + }; + if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - dr::__detail::parallel_for( - dr::mp::sycl_queue(), sycl::range<3>(distance[0], distance[1], distance[2]), - do_point) - .wait(); + dr::__detail::parallel_for( + dr::mp::sycl_queue(), + sycl::range<3>(distance[0], distance[1], distance[2]), do_point) + .wait(); #else - assert(false); + assert(false); #endif - } else { - for (std::size_t i = 0; i < distance[0]; i++) { - for (std::size_t j = 0; j < distance[1]; j++) { - for (std::size_t k = 0; k < distance[3]; k++) { - do_point(stencil_index_type<3>{i, j, k}); - } + } else { + for (std::size_t i = 0; i < distance[0]; i++) { + for (std::size_t j = 0; j < distance[1]; j++) { + for (std::size_t k = 0; k < distance[3]; k++) { + do_point(stencil_index_type<3>{i, j, k}); } } } } } +} // namespace __detail template -requires (1 <= Rank && Rank <= 3) -void stencil_for_each_extended(auto op, __detail::stencil_index_type begin, __detail::stencil_index_type end, dr::distributed_range auto &&...drs) { +requires(1 <= Rank && Rank <= 3) void stencil_for_each_extended( + auto op, __detail::stencil_index_type begin, + __detail::stencil_index_type end, + dr::distributed_range auto &&...drs) { auto ranges = std::tie(drs...); auto &&dr0 = std::get<0>(ranges); if (rng::empty(dr0)) { @@ -232,15 +235,13 @@ void stencil_for_each_extended(auto op, __detail::stencil_index_type begin for (const auto &segs : all_segments) { if constexpr (Rank == 1) { __detail::stencil_for_each_extended_1(op, begin, end, segs); - } - else if constexpr (Rank == 2) { + } else if constexpr (Rank == 2) { __detail::stencil_for_each_extended_2(op, begin, end, segs); - } - else if constexpr (Rank == 3) { + } else if constexpr (Rank == 3) { __detail::stencil_for_each_extended_3(op, begin, end, segs); - } - else { - static_assert(false, "Not supported"); // sycl for_each does not support more than 3 dimensions + } else { + static_assert(false, "Not supported"); // sycl for_each does not support + // more than 3 dimensions } } barrier(); diff --git a/include/dr/mp/algorithms/md_for_each.hpp b/include/dr/mp/algorithms/md_for_each.hpp index 9d92f0fe98..e0b29578e5 100644 --- a/include/dr/mp/algorithms/md_for_each.hpp +++ b/include/dr/mp/algorithms/md_for_each.hpp @@ -26,12 +26,12 @@ struct any { template concept one_argument = requires(F &f) { - { f(Arg1{}) }; + {f(Arg1{})}; }; template concept two_arguments = requires(F &f) { - { f(Arg1{}, Arg2{}) }; + {f(Arg1{}, Arg2{})}; }; }; // namespace dr::mp::__detail diff --git a/include/dr/mp/containers/distributed_mdarray.hpp b/include/dr/mp/containers/distributed_mdarray.hpp index 3ad0a25c11..8050b4b035 100644 --- a/include/dr/mp/containers/distributed_mdarray.hpp +++ b/include/dr/mp/containers/distributed_mdarray.hpp @@ -77,9 +77,9 @@ template class distributed_mdarray { shape_type tile_shape_; DV dv_; - using mdspan_type = - decltype(make_md_view(std::declval(), std::declval(), - std::declval(), std::declval())); + using mdspan_type = decltype(make_md_view( + std::declval(), std::declval(), + std::declval(), std::declval())); mdspan_type md_view_; distribution dist_; }; diff --git a/include/dr/mp/containers/distributed_vector.hpp b/include/dr/mp/containers/distributed_vector.hpp index 8e799b79d8..0c8af9d4c0 100644 --- a/include/dr/mp/containers/distributed_vector.hpp +++ b/include/dr/mp/containers/distributed_vector.hpp @@ -276,9 +276,8 @@ template class distributed_vector { void fence() { backend.fence(); } - const auto &dist() const { - return distribution_; - } + const auto &dist() const { return distribution_; } + private: void init(auto size, auto dist) { size_ = size; @@ -307,12 +306,9 @@ template class distributed_vector { std::size_t segment_index = 0; for (std::size_t i = 0; i < size; i += segment_size_) { - segments_.emplace_back( - this, - segment_index++, - std::min(segment_size_, size - i), - data_size_, - ext_dist); + segments_.emplace_back(this, segment_index++, + std::min(segment_size_, size - i), data_size_, + ext_dist); } fence(); diff --git a/include/dr/mp/containers/distribution.hpp b/include/dr/mp/containers/distribution.hpp index 2fe0049b07..6fa00145a1 100644 --- a/include/dr/mp/containers/distribution.hpp +++ b/include/dr/mp/containers/distribution.hpp @@ -62,8 +62,7 @@ struct extended_local_data_distribution { std::size_t segment_size; extended_local_data_distribution() = default; - extended_local_data_distribution(std::size_t segment_size, - std::size_t size, + extended_local_data_distribution(std::size_t segment_size, std::size_t size, halo_bounds hb) : segment_size(segment_size) { if (default_comm().rank() * segment_size >= hb.prev) diff --git a/include/dr/mp/containers/segment.hpp b/include/dr/mp/containers/segment.hpp index 1318ecf53d..69c0ace053 100644 --- a/include/dr/mp/containers/segment.hpp +++ b/include/dr/mp/containers/segment.hpp @@ -216,11 +216,13 @@ template class dv_segment { using iterator = dv_segment_iterator; using stencil_index_type = dr::__detail::dr_extents<1>; + public: using difference_type = std::ptrdiff_t; dv_segment() = default; dv_segment(DV *dv, std::size_t segment_index, std::size_t size, - std::size_t reserved, const extended_local_data_distribution& ext_dist) { + std::size_t reserved, + const extended_local_data_distribution &ext_dist) { dv_ = dv; segment_index_ = segment_index; size_ = size; @@ -242,13 +244,20 @@ template class dv_segment { auto end() const { return begin() + size(); } auto reserved() const { return reserved_; } - [[nodiscard]] stencil_index_type begin_stencil(stencil_index_type stencil) const { - return {std::min(std::max(begin_index_, ext_dist_.begin + stencil[0]), end_index_) - begin_index_}; + [[nodiscard]] stencil_index_type + begin_stencil(stencil_index_type stencil) const { + return {std::min(std::max(begin_index_, ext_dist_.begin + stencil[0]), + end_index_) - + begin_index_}; } - [[nodiscard]] stencil_index_type end_stencil(stencil_index_type stencil) const { - return {std::max(std::min(end_index_, ext_dist_.end - stencil[0]), begin_index_) - begin_index_}; + [[nodiscard]] stencil_index_type + end_stencil(stencil_index_type stencil) const { + return {std::max(std::min(end_index_, ext_dist_.end - stencil[0]), + begin_index_) - + begin_index_}; } - [[nodiscard]] std::pair stencil(stencil_index_type begin, stencil_index_type end) const { + [[nodiscard]] std::pair + stencil(stencil_index_type begin, stencil_index_type end) const { return {begin_stencil(begin), end_stencil(end)}; } auto extents() const { return md::extents(reserved_); } @@ -256,6 +265,7 @@ template class dv_segment { auto operator[](difference_type n) const { return *(begin() + n); } bool is_local() const { return segment_index_ == default_comm().rank(); } + private: DV *dv_ = nullptr; std::size_t segment_index_; @@ -273,7 +283,7 @@ template class dv_segment { // template concept has_halo_method = dr::distributed_range && requires(DR &&dr) { - { rng::begin(dr::ranges::segments(dr)[0]).halo() }; + {rng::begin(dr::ranges::segments(dr)[0]).halo()}; }; auto &halo(has_halo_method auto &&dr) { diff --git a/include/dr/mp/halo.hpp b/include/dr/mp/halo.hpp index cdf884f169..3d76fdfe7d 100644 --- a/include/dr/mp/halo.hpp +++ b/include/dr/mp/halo.hpp @@ -4,7 +4,7 @@ #pragma once -#include +#include #include +#include #include -#include diff --git a/include/dr/mp/halo/halo.hpp b/include/dr/mp/halo/halo.hpp index cfe1d49f08..a8d8b92419 100644 --- a/include/dr/mp/halo/halo.hpp +++ b/include/dr/mp/halo/halo.hpp @@ -9,185 +9,188 @@ namespace dr::mp { - enum class halo_tag { - invalid, - forward, - reverse, - index, - }; - - struct halo_bounds { - // How many values before and after the data segment are in halo - std::size_t prev = 0, next = 0; - bool periodic = false; - }; - - template - class halo_impl { - using T = typename Group::element_type; - using Memory = typename Group::memory_type; - - public: - using group_type = Group; - - // Destructor frees buffer_, so cannot copy - halo_impl(const halo_impl &) = delete; - - halo_impl operator=(const halo_impl &) = delete; - - /// halo constructor - halo_impl(communicator comm, const std::vector &owned_groups, - const std::vector &halo_groups, - const Memory &memory = Memory()) - : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), - memory_(memory) { - DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), - rng::size(halo_groups)); - buffer_size_ = 0; - std::size_t i = 0; - std::vector buffer_index; - for (auto &g: owned_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - for (auto &g: halo_groups_) { - buffer_index.push_back(buffer_size_); - g.request_index = i++; - buffer_size_ += g.buffer_size(); - map_.push_back(&g); - } - buffer_ = memory_.allocate(buffer_size_); - assert(buffer_ != nullptr); - i = 0; - for (auto &g: owned_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - for (auto &g: halo_groups_) { - g.buffer = &buffer_[buffer_index[i++]]; - } - requests_.resize(i); +enum class halo_tag { + invalid, + forward, + reverse, + index, +}; + +struct halo_bounds { + // How many values before and after the data segment are in halo + std::size_t prev = 0, next = 0; + bool periodic = false; +}; + +template class halo_impl { + using T = typename Group::element_type; + using Memory = typename Group::memory_type; + +public: + using group_type = Group; + + // Destructor frees buffer_, so cannot copy + halo_impl(const halo_impl &) = delete; + + halo_impl operator=(const halo_impl &) = delete; + + /// halo constructor + halo_impl(communicator comm, const std::vector &owned_groups, + const std::vector &halo_groups, + const Memory &memory = Memory()) + : comm_(comm), halo_groups_(halo_groups), owned_groups_(owned_groups), + memory_(memory) { + DRLOG("Halo constructed with {}/{} owned/halo", rng::size(owned_groups), + rng::size(halo_groups)); + buffer_size_ = 0; + std::size_t i = 0; + std::vector buffer_index; + for (auto &g : owned_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); } - - /// Begin a halo exchange - void exchange_begin() { - DRLOG("Halo exchange receiving"); - receive(halo_groups_); - DRLOG("Halo exchange sending"); - send(owned_groups_); - DRLOG("Halo exchange begin finished"); + for (auto &g : halo_groups_) { + buffer_index.push_back(buffer_size_); + g.request_index = i++; + buffer_size_ += g.buffer_size(); + map_.push_back(&g); } - - /// Complete a halo exchange - void exchange_finalize() { - DRLOG("Halo exchange finalize started"); - reduce_finalize(); - DRLOG("Halo exchange finalize finished"); + buffer_ = memory_.allocate(buffer_size_); + assert(buffer_ != nullptr); + i = 0; + for (auto &g : owned_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; } - - void exchange() { - exchange_begin(); - exchange_finalize(); + for (auto &g : halo_groups_) { + g.buffer = &buffer_[buffer_index[i++]]; } + requests_.resize(i); + } - /// Begin a halo reduction - void reduce_begin() { - receive(owned_groups_); - send(halo_groups_); - } + /// Begin a halo exchange + void exchange_begin() { + DRLOG("Halo exchange receiving"); + receive(halo_groups_); + DRLOG("Halo exchange sending"); + send(owned_groups_); + DRLOG("Halo exchange begin finished"); + } - /// Complete a halo reduction - void reduce_finalize(const auto &op) { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize(op) waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(op); - } + /// Complete a halo exchange + void exchange_finalize() { + DRLOG("Halo exchange finalize started"); + reduce_finalize(); + DRLOG("Halo exchange finalize finished"); + } + + void exchange() { + exchange_begin(); + exchange_finalize(); + } + + /// Begin a halo reduction + void reduce_begin() { + receive(owned_groups_); + send(halo_groups_); + } + + /// Complete a halo reduction + void reduce_finalize(const auto &op) { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize(op) waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(op); } } + } - /// Complete a halo reduction - void reduce_finalize() { - for (int pending = rng::size(requests_); pending > 0; pending--) { - int completed; - MPI_Waitany(rng::size(requests_), requests_.data(), &completed, - MPI_STATUS_IGNORE); - DRLOG("reduce_finalize() waitany completed: {}", completed); - auto &g = *map_[completed]; - if (g.receive && g.buffered) { - g.unpack(); - } + /// Complete a halo reduction + void reduce_finalize() { + for (int pending = rng::size(requests_); pending > 0; pending--) { + int completed; + MPI_Waitany(rng::size(requests_), requests_.data(), &completed, + MPI_STATUS_IGNORE); + DRLOG("reduce_finalize() waitany completed: {}", completed); + auto &g = *map_[completed]; + if (g.receive && g.buffered) { + g.unpack(); } } + } - struct second_op { - T operator()(T &a, T &b) const { return b; } - } second; + struct second_op { + T operator()(T &a, T &b) const { return b; } + } second; - struct plus_op { - T operator()(T &a, T &b) const { return a + b; } - } plus; + struct plus_op { + T operator()(T &a, T &b) const { return a + b; } + } plus; - struct max_op { - T operator()(T &a, T &b) const { return std::max(a, b); } - } max; + struct max_op { + T operator()(T &a, T &b) const { return std::max(a, b); } + } max; - struct min_op { - T operator()(T &a, T &b) const { return std::min(a, b); } - } min; + struct min_op { + T operator()(T &a, T &b) const { return std::min(a, b); } + } min; - struct multiplies_op { - T operator()(T &a, T &b) const { return a * b; } - } multiplies; + struct multiplies_op { + T operator()(T &a, T &b) const { return a * b; } + } multiplies; - ~halo_impl() { - if (buffer_) { - memory_.deallocate(buffer_, buffer_size_); - buffer_ = nullptr; - } + ~halo_impl() { + if (buffer_) { + memory_.deallocate(buffer_, buffer_size_); + buffer_ = nullptr; } + } - private: - void send(std::vector &sends) { - for (auto &g: sends) { - g.pack(); - g.receive = false; - DRLOG("sending: {}", g.request_index); -// std::cout << "send(" << g.data_pointer() << ", " << g.data_size() << ", " << g.rank() << ", , " << &requests_[g.request_index] << ")\n"; - comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } +private: + void send(std::vector &sends) { + for (auto &g : sends) { + g.pack(); + g.receive = false; + DRLOG("sending: {}", g.request_index); + // std::cout << "send(" << g.data_pointer() << ", " << + // g.data_size() << ", " << g.rank() << ", , " << + // &requests_[g.request_index] << ")\n"; + comm_.isend(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); } + } - void receive(std::vector &receives) { - for (auto &g: receives) { - g.receive = true; - DRLOG("receiving: {}", g.request_index); -// std::cout << "recv(" << g.data_pointer() << ", " << g.data_size() << ", " << g.rank() << ", , " << &requests_[g.request_index] << ")\n"; - comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), - &requests_[g.request_index]); - } + void receive(std::vector &receives) { + for (auto &g : receives) { + g.receive = true; + DRLOG("receiving: {}", g.request_index); + // std::cout << "recv(" << g.data_pointer() << ", " << + // g.data_size() << ", " << g.rank() << ", , " << + // &requests_[g.request_index] << ")\n"; + comm_.irecv(g.data_pointer(), g.data_size(), g.rank(), g.tag(), + &requests_[g.request_index]); } + } - communicator comm_; - std::vector halo_groups_, owned_groups_; - T *buffer_ = nullptr; - std::size_t buffer_size_; - std::vector requests_; - std::vector map_; - Memory memory_; - }; - - template - void halo_exchange(auto&& f, T &dv, Ts &...dvs) { - for (std::size_t step = 0; step < dv.dist().redundancy(); step++) { - f(dv, dvs...); - } - halo(dv).exchange(); + communicator comm_; + std::vector halo_groups_, owned_groups_; + T *buffer_ = nullptr; + std::size_t buffer_size_; + std::vector requests_; + std::vector map_; + Memory memory_; +}; + +template +void halo_exchange(auto &&f, T &dv, Ts &...dvs) { + for (std::size_t step = 0; step < dv.dist().redundancy(); step++) { + f(dv, dvs...); } + halo(dv).exchange(); } +} // namespace dr::mp diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp index 4541962135..12fc980c49 100644 --- a/include/dr/mp/halo/instance.hpp +++ b/include/dr/mp/halo/instance.hpp @@ -5,47 +5,47 @@ #pragma once #include -#include -#include #include +#include +#include namespace dr::mp { -template -using unstructured_halo_impl = halo_impl >; +template +using unstructured_halo_impl = halo_impl>; -template> +template > class unstructured_halo : public unstructured_halo_impl { public: using group_type = index_group; - using index_map = std::pair >; + using index_map = std::pair>; /// /// Constructor /// unstructured_halo(communicator comm, T *data, - const std::vector &owned, - const std::vector &halo, + const std::vector &owned, + const std::vector &halo, const Memory &memory = Memory()) : unstructured_halo_impl( - comm, make_groups(comm, data, owned, memory), - make_groups(comm, data, halo, memory), memory) {} + comm, make_groups(comm, data, owned, memory), + make_groups(comm, data, halo, memory), memory) {} private: - static std::vector make_groups(communicator comm, T *data, - const std::vector &map, - const Memory &memory) { - std::vector groups; - for (auto const &[rank, indices]: map) { + static std::vector make_groups(communicator comm, T *data, + const std::vector &map, + const Memory &memory) { + std::vector groups; + for (auto const &[rank, indices] : map) { groups.emplace_back(data, rank, indices, memory); } return groups; } }; -template -using span_halo_impl = halo_impl >; +template +using span_halo_impl = halo_impl>; -template> +template > class span_halo : public span_halo_impl { public: using group_type = span_group; @@ -58,7 +58,7 @@ class span_halo : public span_halo_impl { check(size, hb); } - span_halo(communicator comm, std::span span, halo_bounds hb) + span_halo(communicator comm, std::span span, halo_bounds hb) : span_halo_impl(comm, owned_groups(comm, span, hb), halo_groups(comm, span, hb)) {} @@ -67,9 +67,9 @@ class span_halo : public span_halo_impl { assert(size >= hb.prev + hb.next + std::max(hb.prev, hb.next)); } - static std::vector - owned_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector owned; + static std::vector + owned_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector owned; DRLOG("owned groups {}/{} first/last", comm.first(), comm.last()); if (hb.next > 0 && (hb.periodic || !comm.first())) { owned.emplace_back(span.subspan(hb.prev, hb.next), comm.prev(), @@ -83,9 +83,9 @@ class span_halo : public span_halo_impl { return owned; } - static std::vector - halo_groups(communicator comm, std::span span, halo_bounds hb) { - std::vector halo; + static std::vector + halo_groups(communicator comm, std::span span, halo_bounds hb) { + std::vector halo; if (hb.prev > 0 && (hb.periodic || !comm.first())) { halo.emplace_back(span.first(hb.prev), comm.prev(), halo_tag::forward); } @@ -95,4 +95,4 @@ class span_halo : public span_halo_impl { return halo; } }; -} \ No newline at end of file +} // namespace dr::mp \ No newline at end of file diff --git a/include/dr/mp/views/mdspan_view.hpp b/include/dr/mp/views/mdspan_view.hpp index 3ffd58dc71..44da9f9bbe 100644 --- a/include/dr/mp/views/mdspan_view.hpp +++ b/include/dr/mp/views/mdspan_view.hpp @@ -26,11 +26,13 @@ template class md_segment : public rng::view_interface> { private: using stencil_index_type = dr::__detail::dr_extents; + public: using index_type = dr::__detail::dr_extents; md_segment() {} - md_segment(index_type origin, BaseSegment segment, index_type tile_shape, extended_local_data_distribution ext_dist) + md_segment(index_type origin, BaseSegment segment, index_type tile_shape, + extended_local_data_distribution ext_dist) : base_(segment), origin_(origin), mdspan_(local_tile(segment, tile_shape)), mdspan_extended_(local_tile_extended(segment, tile_shape)), @@ -51,23 +53,33 @@ class md_segment : public rng::view_interface> { auto halo() const { return dr::mp::halo(base_); } - [[nodiscard]] stencil_index_type begin_stencil(stencil_index_type stencil) const { + [[nodiscard]] stencil_index_type + begin_stencil(stencil_index_type stencil) const { stencil_index_type out; // Supports only 1d distribution for (std::size_t i = 0; i < Rank; i++) { - out[i] = std::min(std::max(origin_[i], (i == 0 ? ext_dist_.begin : origin_[i]) + stencil[i]), end_[i]) - origin_[i]; + out[i] = std::min(std::max(origin_[i], + (i == 0 ? ext_dist_.begin : origin_[i]) + + stencil[i]), + end_[i]) - + origin_[i]; } return out; } - [[nodiscard]] stencil_index_type end_stencil(stencil_index_type stencil) const { + [[nodiscard]] stencil_index_type + end_stencil(stencil_index_type stencil) const { stencil_index_type out; // Supports only 1d distribution for (std::size_t i = 0; i < Rank; i++) { - out[i] = std::max(std::min(end_[i], (i == 0 ? ext_dist_.end : end_[i]) - stencil[i]), origin_[i]) - origin_[i]; + out[i] = std::max(std::min(end_[i], (i == 0 ? ext_dist_.end : end_[i]) - + stencil[i]), + origin_[i]) - + origin_[i]; } return out; } - [[nodiscard]] std::pair stencil(stencil_index_type begin, stencil_index_type end) const { + [[nodiscard]] std::pair + stencil(stencil_index_type begin, stencil_index_type end) const { return {begin_stencil(begin), end_stencil(end)}; } @@ -89,7 +101,8 @@ class md_segment : public rng::view_interface> { return md::mdspan(ptr, tile_shape); } - static auto local_tile_extended(BaseSegment segment, const index_type &tile_shape) { + static auto local_tile_extended(BaseSegment segment, + const index_type &tile_shape) { T *ptr = std::to_address(dr::ranges::local(rng::begin(segment))); return md::mdspan(ptr, tile_shape); } @@ -98,7 +111,8 @@ class md_segment : public rng::view_interface> { index_type origin_; index_type end_; md::mdspan, md::layout_stride> mdspan_; - md::mdspan, md::layout_stride> mdspan_extended_; + md::mdspan, md::layout_stride> + mdspan_extended_; extended_local_data_distribution ext_dist_; }; @@ -143,8 +157,10 @@ struct mdspan_view : public rng::view_interface> { return origin; } - static auto make_segments(auto base, auto full_shape, auto tile_shape, auto dist) { - extended_local_data_distribution ext_dist(tile_shape[0], full_shape[0], dist.halo()); + static auto make_segments(auto base, auto full_shape, auto tile_shape, + auto dist) { + extended_local_data_distribution ext_dist(tile_shape[0], full_shape[0], + dist.halo()); auto make_md = [=](auto v) { auto clipped = tile_shape; @@ -165,8 +181,9 @@ struct mdspan_view : public rng::view_interface> { return dr::__detail::bounded_enumerate(dr::ranges::segments(base)) | rng::views::transform(make_md); } - using segments_type = decltype(make_segments(std::declval(), - full_shape_, tile_shape_, std::declval())); + using segments_type = + decltype(make_segments(std::declval(), full_shape_, + tile_shape_, std::declval())); public: mdspan_view(R r, dr::__detail::dr_extents full_shape, distribution dist) @@ -241,8 +258,9 @@ mdspan_view(R &&r, dr::__detail::dr_extents full_shape, -> mdspan_view, Rank>; template -concept is_mdspan_view = - dr::distributed_range && requires(R &r) { r.mdspan(); }; +concept is_mdspan_view = dr::distributed_range && requires(R &r) { + r.mdspan(); +}; } // namespace dr::mp @@ -253,7 +271,8 @@ template class mdspan_adapter_closure { mdspan_adapter_closure(dr::__detail::dr_extents full_shape, dr::__detail::dr_extents tile_shape, distribution dist) - : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true), dist_(dist) {} + : full_shape_(full_shape), tile_shape_(tile_shape), tile_valid_(true), + dist_(dist) {} mdspan_adapter_closure(dr::__detail::dr_extents full_shape, distribution dist) @@ -282,27 +301,28 @@ template class mdspan_adapter_closure { class mdspan_fn_ { public: template - auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape, distribution dist) const { + auto operator()(R &&r, Shape &&full_shape, Shape &&tile_shape, + distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), std::forward(tile_shape), - dist)( - std::forward(r)); + dist)(std::forward(r)); } template auto operator()(R &&r, Shape &&full_shape, distribution dist) const { - return mdspan_adapter_closure(std::forward(full_shape), dist)( - std::forward(r)); + return mdspan_adapter_closure(std::forward(full_shape), + dist)(std::forward(r)); } template - auto operator()(Shape &&full_shape, Shape &&tile_shape, distribution dist) const { + auto operator()(Shape &&full_shape, Shape &&tile_shape, + distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), - std::forward(tile_shape), - dist); + std::forward(tile_shape), dist); } - template auto operator()(Shape &&full_shape, distribution dist) const { + template + auto operator()(Shape &&full_shape, distribution dist) const { return mdspan_adapter_closure(std::forward(full_shape), dist); } }; diff --git a/test/gtest/mp/halo-3.cpp b/test/gtest/mp/halo-3.cpp index 95ca25cff7..b2131d430a 100644 --- a/test/gtest/mp/halo-3.cpp +++ b/test/gtest/mp/halo-3.cpp @@ -230,64 +230,64 @@ TYPED_TEST(Halo3, halo_wide) { fill(dv, 13); switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(*(dv.begin() + 0).local(), 13); - EXPECT_EQ(*(dv.begin() + 1).local(), 13); - EXPECT_EQ(*(dv.begin() + 2).local(), 13); - EXPECT_EQ(*(dv.begin() + 3).local(), 7); - EXPECT_EQ(*(dv.begin() + 4).local(), 7); - EXPECT_EQ(*(dv.begin() + 5).local(), 7); - break; - case 1: - EXPECT_EQ(*(dv.begin() + 0).local(), 7); - EXPECT_EQ(*(dv.begin() + 1).local(), 7); - EXPECT_EQ(*(dv.begin() + 2).local(), 7); - EXPECT_EQ(*(dv.begin() + 3).local(), 13); - EXPECT_EQ(*(dv.begin() + 4).local(), 13); - EXPECT_EQ(*(dv.begin() + 5).local(), 13); - EXPECT_EQ(*(dv.begin() + 6).local(), 7); - EXPECT_EQ(*(dv.begin() + 7).local(), 7); - EXPECT_EQ(*(dv.begin() + 8).local(), 7); - break; - case 2: - EXPECT_EQ(*(dv.begin() + 3).local(), 7); - EXPECT_EQ(*(dv.begin() + 4).local(), 7); - EXPECT_EQ(*(dv.begin() + 5).local(), 7); - EXPECT_EQ(*(dv.begin() + 6).local(), 13); - EXPECT_EQ(*(dv.begin() + 7).local(), 13); - EXPECT_EQ(*(dv.begin() + 8).local(), 13); - break; + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 7); + EXPECT_EQ(*(dv.begin() + 1).local(), 7); + EXPECT_EQ(*(dv.begin() + 2).local(), 7); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 7); + EXPECT_EQ(*(dv.begin() + 7).local(), 7); + EXPECT_EQ(*(dv.begin() + 8).local(), 7); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 7); + EXPECT_EQ(*(dv.begin() + 4).local(), 7); + EXPECT_EQ(*(dv.begin() + 5).local(), 7); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; } dv.halo().exchange(); switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(*(dv.begin() + 0).local(), 13); - EXPECT_EQ(*(dv.begin() + 1).local(), 13); - EXPECT_EQ(*(dv.begin() + 2).local(), 13); - EXPECT_EQ(*(dv.begin() + 3).local(), 13); - EXPECT_EQ(*(dv.begin() + 4).local(), 13); - EXPECT_EQ(*(dv.begin() + 5).local(), 13); - break; - case 1: - EXPECT_EQ(*(dv.begin() + 0).local(), 13); - EXPECT_EQ(*(dv.begin() + 1).local(), 13); - EXPECT_EQ(*(dv.begin() + 2).local(), 13); - EXPECT_EQ(*(dv.begin() + 3).local(), 13); - EXPECT_EQ(*(dv.begin() + 4).local(), 13); - EXPECT_EQ(*(dv.begin() + 5).local(), 13); - EXPECT_EQ(*(dv.begin() + 6).local(), 13); - EXPECT_EQ(*(dv.begin() + 7).local(), 13); - EXPECT_EQ(*(dv.begin() + 8).local(), 13); - break; - case 2: - EXPECT_EQ(*(dv.begin() + 3).local(), 13); - EXPECT_EQ(*(dv.begin() + 4).local(), 13); - EXPECT_EQ(*(dv.begin() + 5).local(), 13); - EXPECT_EQ(*(dv.begin() + 6).local(), 13); - EXPECT_EQ(*(dv.begin() + 7).local(), 13); - EXPECT_EQ(*(dv.begin() + 8).local(), 13); - break; + case 0: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + break; + case 1: + EXPECT_EQ(*(dv.begin() + 0).local(), 13); + EXPECT_EQ(*(dv.begin() + 1).local(), 13); + EXPECT_EQ(*(dv.begin() + 2).local(), 13); + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; + case 2: + EXPECT_EQ(*(dv.begin() + 3).local(), 13); + EXPECT_EQ(*(dv.begin() + 4).local(), 13); + EXPECT_EQ(*(dv.begin() + 5).local(), 13); + EXPECT_EQ(*(dv.begin() + 6).local(), 13); + EXPECT_EQ(*(dv.begin() + 7).local(), 13); + EXPECT_EQ(*(dv.begin() + 8).local(), 13); + break; } } \ No newline at end of file diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp index 20717ce74c..6f12bc14e1 100644 --- a/test/gtest/mp/wide-halo-1d-3.cpp +++ b/test/gtest/mp/wide-halo-1d-3.cpp @@ -13,14 +13,10 @@ const std::size_t redundancy = 2; const std::size_t size = 6; dr::mp::distribution get_distribution() { - return dr::mp::distribution() - .halo(1) - .redundancy(redundancy); + return dr::mp::distribution().halo(1).redundancy(redundancy); } -int& get(Array& v, std::size_t i) { - return *(v.begin() + i).local(); -} +int &get(Array &v, std::size_t i) { return *(v.begin() + i).local(); } TEST(WideHalo3, suite_works_for_3_processes_only) { EXPECT_EQ(dr::mp::default_comm().size(), 3); @@ -36,51 +32,57 @@ TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { dv.halo().exchange(); dv_out.halo().exchange(); - auto print = [&](const auto& v) { + auto print = [&](const auto &v) { for (auto seg : v.segments()) { - for (auto i = seg.begin_stencil({0ul})[0]; i < seg.end_stencil({0ul})[0]; i++) { + for (auto i = seg.begin_stencil({0ul})[0]; i < seg.end_stencil({0ul})[0]; + i++) { std::cout << *(seg.begin() + i).local() << " "; } } std::cout << "\n"; }; - auto transform = [&]{ - stencil_for_each_extended<1>([](auto stencils){ - auto [x, x_out] = stencils; - x_out(0) = x(-1) + x(0) + x(1); - }, {1}, {1}, dv, dv_out); - stencil_for_each_extended<1>([](auto stencils){ - auto [x, x_out] = stencils; - x(0) = x_out(0); - }, {0}, {0}, dv, dv_out); + auto transform = [&] { + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, + {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, + {0}, {0}, dv, dv_out); }; transform(); print(dv); - // after first step, only actually stored values and their neighbours are guaranteed to be correct + // after first step, only actually stored values and their neighbours are + // guaranteed to be correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 3); - EXPECT_EQ(get(dv, 2), 3); - EXPECT_EQ(get(dv, 3), 1); - break; - case 1: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 3); - EXPECT_EQ(get(dv, 2), 3); - EXPECT_EQ(get(dv, 3), 3); - EXPECT_EQ(get(dv, 4), 3); - EXPECT_EQ(get(dv, 5), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2), 1); - EXPECT_EQ(get(dv, 3), 3); - EXPECT_EQ(get(dv, 4), 3); - EXPECT_EQ(get(dv, 5), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 3); + EXPECT_EQ(get(dv, 2), 3); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 3); + EXPECT_EQ(get(dv, 4), 3); + EXPECT_EQ(get(dv, 5), 1); + break; } // after second step, only actually stored values are guaranteed to be correct @@ -89,26 +91,26 @@ TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { print(dv); switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 7); - EXPECT_EQ(get(dv, 3), 1); - break; - case 1: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2), 1); - EXPECT_EQ(get(dv, 3), 7); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 7); + EXPECT_EQ(get(dv, 3), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 1); + EXPECT_EQ(get(dv, 3), 7); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; } // after exchange all are correct @@ -116,26 +118,26 @@ TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { print(dv); switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - break; - case 1: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; } } @@ -149,38 +151,43 @@ TEST(WideHalo3, halo_api_works) { dv.halo().exchange(); dv_out.halo().exchange(); - halo_exchange([](Array& dv, Array& dv_out){ - stencil_for_each_extended<1>([](auto stencils){ - auto [x, x_out] = stencils; - x_out(0) = x(-1) + x(0) + x(1); - }, {1}, {1}, dv, dv_out); - stencil_for_each_extended<1>([](auto stencils){ - auto [x, x_out] = stencils; - x(0) = x_out(0); - }, {0}, {0}, dv, dv_out); - }, dv, dv_out); + halo_exchange( + [](Array &dv, Array &dv_out) { + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0) = x(-1) + x(0) + x(1); + }, + {1}, {1}, dv, dv_out); + stencil_for_each_extended<1>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0) = x_out(0); + }, + {0}, {0}, dv, dv_out); + }, + dv, dv_out); // after exchange all are correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - break; - case 1: - EXPECT_EQ(get(dv, 0), 1); - EXPECT_EQ(get(dv, 1), 7); - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2), 9); - EXPECT_EQ(get(dv, 3), 9); - EXPECT_EQ(get(dv, 4), 7); - EXPECT_EQ(get(dv, 5), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + break; + case 1: + EXPECT_EQ(get(dv, 0), 1); + EXPECT_EQ(get(dv, 1), 7); + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2), 9); + EXPECT_EQ(get(dv, 3), 9); + EXPECT_EQ(get(dv, 4), 7); + EXPECT_EQ(get(dv, 5), 1); + break; } } - diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp index e5d7b90759..c0e7acf716 100644 --- a/test/gtest/mp/wide-halo-2d-3.cpp +++ b/test/gtest/mp/wide-halo-2d-3.cpp @@ -13,12 +13,10 @@ const std::size_t redundancy = 2; const std::array size = {6, 6}; dr::mp::distribution get_distribution() { - return dr::mp::distribution() - .halo(1) - .redundancy(redundancy); + return dr::mp::distribution().halo(1).redundancy(redundancy); } -int& get(Array& v, std::size_t i, std::size_t j) { +int &get(Array &v, std::size_t i, std::size_t j) { return *(v.begin() + i * size[0] + j).local(); } @@ -36,22 +34,26 @@ TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { dv.halo().exchange(); dv_out.halo().exchange(); - auto transform = [&]{ - stencil_for_each_extended<2>([](auto stencils){ - auto [x, x_out] = stencils; - x_out(0, 0) = 0; - for (int i = -1; i <= 1; i++) { - for (int j = -1; j <= 1; j++) { - x_out(0, 0) += x(i, j); - } - } - }, {1, 1}, {1, 1}, dv, dv_out); - stencil_for_each_extended<2>([](auto stencils){ - auto [x, x_out] = stencils; - x(0, 0) = x_out(0, 0); - }, {0, 0}, {0, 0}, dv, dv_out); + auto transform = [&] { + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, + {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, + {0, 0}, {0, 0}, dv, dv_out); }; - auto print = [](std::string s, const auto& v) { + auto print = [](std::string s, const auto &v) { std::cout << s << "\n"; for (auto seg : v.segments()) { auto [beg, end] = seg.stencil({0, 0}, {0, 0}); @@ -69,68 +71,69 @@ TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { transform(); print("dv", dv); - // after first step, only actually stored values and their neighbours are guaranteed to be correct + // after first step, only actually stored values and their neighbours are + // guaranteed to be correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 9); - EXPECT_EQ(get(dv, 2, 1), 9); - EXPECT_EQ(get(dv, 3, 1), 1); - break; - case 1: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 9); - EXPECT_EQ(get(dv, 2, 1), 9); - EXPECT_EQ(get(dv, 3, 1), 9); - EXPECT_EQ(get(dv, 4, 1), 9); - EXPECT_EQ(get(dv, 5, 1), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2, 1), 1); - EXPECT_EQ(get(dv, 3, 1), 9); - EXPECT_EQ(get(dv, 4, 1), 9); - EXPECT_EQ(get(dv, 5, 1), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 9); + EXPECT_EQ(get(dv, 2, 1), 9); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 9); + EXPECT_EQ(get(dv, 4, 1), 9); + EXPECT_EQ(get(dv, 5, 1), 1); + break; } transform(); print("dv", dv); // after second step, only actually stored values are guaranteed to be correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 41); - EXPECT_EQ(get(dv, 3, 1), 1); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 57); - EXPECT_EQ(get(dv, 3, 2), 1); - break; - case 1: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2, 1), 1); - EXPECT_EQ(get(dv, 3, 1), 41); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 2, 2), 1); - EXPECT_EQ(get(dv, 3, 2), 57); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 41); + EXPECT_EQ(get(dv, 3, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 57); + EXPECT_EQ(get(dv, 3, 2), 1); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 1); + EXPECT_EQ(get(dv, 3, 1), 41); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 1); + EXPECT_EQ(get(dv, 3, 2), 57); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; } dv.halo().exchange(); @@ -138,40 +141,40 @@ TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { print("dv", dv); // after exchange all are correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - break; - case 1: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; } } @@ -185,7 +188,7 @@ TEST(WideHalo3, halo2d_api_works) { dv.halo().exchange(); dv_out.halo().exchange(); - auto print = [](std::string s, const auto& v) { + auto print = [](std::string s, const auto &v) { std::cout << s << "\n"; for (auto seg : v.segments()) { auto [beg, end] = seg.stencil({0, 0}, {0, 0}); @@ -201,58 +204,64 @@ TEST(WideHalo3, halo2d_api_works) { print("dv", dv); - halo_exchange([](Array& dv, Array& dv_out){ - stencil_for_each_extended<2>([](auto stencils){ - auto [x, x_out] = stencils; - x_out(0, 0) = 0; - for (int i = -1; i <= 1; i++) { - for (int j = -1; j <= 1; j++) { - x_out(0, 0) += x(i, j); - } - } - }, {1, 1}, {1, 1}, dv, dv_out); - stencil_for_each_extended<2>([](auto stencils){ - auto [x, x_out] = stencils; - x(0, 0) = x_out(0, 0); - }, {0, 0}, {0, 0}, dv, dv_out); - }, dv, dv_out); + halo_exchange( + [](Array &dv, Array &dv_out) { + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x_out(0, 0) = 0; + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + x_out(0, 0) += x(i, j); + } + } + }, + {1, 1}, {1, 1}, dv, dv_out); + stencil_for_each_extended<2>( + [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }, + {0, 0}, {0, 0}, dv, dv_out); + }, + dv, dv_out); print("dv", dv); // after exchange all are correct switch (dr::mp::default_comm().rank()) { - case 0: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - break; - case 1: - EXPECT_EQ(get(dv, 0, 1), 1); - EXPECT_EQ(get(dv, 1, 1), 41); - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 0, 2), 1); - EXPECT_EQ(get(dv, 1, 2), 57); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; - case 2: - EXPECT_EQ(get(dv, 2, 1), 57); - EXPECT_EQ(get(dv, 3, 1), 57); - EXPECT_EQ(get(dv, 4, 1), 41); - EXPECT_EQ(get(dv, 5, 1), 1); - EXPECT_EQ(get(dv, 2, 2), 81); - EXPECT_EQ(get(dv, 3, 2), 81); - EXPECT_EQ(get(dv, 4, 2), 57); - EXPECT_EQ(get(dv, 5, 2), 1); - break; + case 0: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + break; + case 1: + EXPECT_EQ(get(dv, 0, 1), 1); + EXPECT_EQ(get(dv, 1, 1), 41); + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 0, 2), 1); + EXPECT_EQ(get(dv, 1, 2), 57); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; + case 2: + EXPECT_EQ(get(dv, 2, 1), 57); + EXPECT_EQ(get(dv, 3, 1), 57); + EXPECT_EQ(get(dv, 4, 1), 41); + EXPECT_EQ(get(dv, 5, 1), 1); + EXPECT_EQ(get(dv, 2, 2), 81); + EXPECT_EQ(get(dv, 3, 2), 81); + EXPECT_EQ(get(dv, 4, 2), 57); + EXPECT_EQ(get(dv, 5, 2), 1); + break; } } From f8d0434c3ee56b2bf94318f955057a56bb7ca3bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Fri, 8 Nov 2024 20:45:41 +0100 Subject: [PATCH 09/19] Fix after clang-format --- include/dr/mp/halo/format.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/dr/mp/halo/format.hpp b/include/dr/mp/halo/format.hpp index 6c329ae63c..2b602c5db6 100644 --- a/include/dr/mp/halo/format.hpp +++ b/include/dr/mp/halo/format.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #ifdef DR_FORMAT From 45ba1abca593940b79d9864e6c7c9b46aeb92632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Sun, 24 Nov 2024 15:29:43 +0100 Subject: [PATCH 10/19] Fix association of tests to cmake target groups and missing EOLs before EOFs --- include/dr/mp/halo/instance.hpp | 2 +- test/gtest/mp/CMakeLists.txt | 4 +--- test/gtest/mp/mdstar.cpp | 18 +++++++++--------- test/gtest/mp/wide-halo-1d-3.cpp | 16 ++++++++-------- test/gtest/mp/wide-halo-2d-3.cpp | 16 ++++++++-------- 5 files changed, 27 insertions(+), 29 deletions(-) diff --git a/include/dr/mp/halo/instance.hpp b/include/dr/mp/halo/instance.hpp index 12fc980c49..04be307690 100644 --- a/include/dr/mp/halo/instance.hpp +++ b/include/dr/mp/halo/instance.hpp @@ -95,4 +95,4 @@ class span_halo : public span_halo_impl { return halo; } }; -} // namespace dr::mp \ No newline at end of file +} // namespace dr::mp diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index d38909270e..e845b128d1 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -43,9 +43,7 @@ add_executable( stencil.cpp segments.cpp slide_view.cpp - wave_kernel.cpp - wide-halo-1d-3.cpp - wide-halo-2d-3.cpp) + wave_kernel.cpp) add_executable( mp-tests-3 diff --git a/test/gtest/mp/mdstar.cpp b/test/gtest/mp/mdstar.cpp index 0e964a0d18..7670a0717e 100644 --- a/test/gtest/mp/mdstar.cpp +++ b/test/gtest/mp/mdstar.cpp @@ -35,7 +35,7 @@ class Mdspan : public ::testing::Test { TEST_F(Mdspan, StaticAssert) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = xp::views::mdspan(dist, extents2d); + auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); static_assert(rng::forward_range); static_assert(dr::distributed_range); auto segments = dr::ranges::segments(mdspan); @@ -47,7 +47,7 @@ TEST_F(Mdspan, StaticAssert) { TEST_F(Mdspan, Iterator) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = xp::views::mdspan(dist, extents2d); + auto mdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); *mdspan.begin() = 17; xp::fence(); @@ -57,7 +57,7 @@ TEST_F(Mdspan, Iterator) { TEST_F(Mdspan, Mdindex2D) { xp::distributed_vector dist(n2d, dist2d_1d); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); std::size_t i = 1, j = 2; dmdspan.mdspan()(i, j) = 17; @@ -68,7 +68,7 @@ TEST_F(Mdspan, Mdindex2D) { TEST_F(Mdspan, Mdindex3D) { xp::distributed_vector dist(n3d, dist3d_1d); - auto dmdspan = xp::views::mdspan(dist, extents3d); + auto dmdspan = xp::views::mdspan(dist, extents3d, dist3d_1d); std::size_t i = 1, j = 2, k = 0; dmdspan.mdspan()(i, j, k) = 17; @@ -79,7 +79,7 @@ TEST_F(Mdspan, Mdindex3D) { TEST_F(Mdspan, Pipe) { xp::distributed_vector dist(n2d, dist2d_1d); - auto mdspan = dist | xp::views::mdspan(extents2d); + auto mdspan = dist | xp::views::mdspan(extents2d, dist2d_1d); *mdspan.begin() = 17; xp::fence(); @@ -89,7 +89,7 @@ TEST_F(Mdspan, Pipe) { TEST_F(Mdspan, SegmentExtents) { xp::distributed_vector dist(n2d, dist2d_1d); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); // Sum of leading dimension matches original std::size_t x = 0; @@ -106,7 +106,7 @@ TEST_F(Mdspan, Subrange) { xp::distributed_vector dist(n2d, dist2d_1d); auto inner = rng::subrange(dist.begin() + ydim, dist.end() - ydim); std::array inner_extents({extents2d[0] - 2, extents2d[1]}); - auto dmdspan = xp::views::mdspan(inner, inner_extents); + auto dmdspan = xp::views::mdspan(inner, inner_extents, dist2d_1d); // Summing up leading dimension size of segments should equal // original minus 2 rows @@ -123,7 +123,7 @@ TEST_F(Mdspan, Subrange) { TEST_F(Mdspan, GridExtents) { xp::distributed_vector dist(n2d, dist2d_1d); xp::iota(dist, 100); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); auto grid = dmdspan.grid(); auto x = 0; @@ -147,7 +147,7 @@ TEST_F(Mdspan, GridLocalReference) { xp::distributed_vector dist(n2d, dist2d_1d); xp::iota(dist, 100); - auto dmdspan = xp::views::mdspan(dist, extents2d); + auto dmdspan = xp::views::mdspan(dist, extents2d, dist2d_1d); auto grid = dmdspan.grid(); auto tile = grid(0, 0).mdspan(); diff --git a/test/gtest/mp/wide-halo-1d-3.cpp b/test/gtest/mp/wide-halo-1d-3.cpp index 6f12bc14e1..686e09dcf1 100644 --- a/test/gtest/mp/wide-halo-1d-3.cpp +++ b/test/gtest/mp/wide-halo-1d-3.cpp @@ -4,25 +4,25 @@ #include "xp-tests.hpp" -template class WideHalo3 : public testing::Test {}; +template class WideHalo3_1D : public testing::Test {}; using T = int; using Array = dr::mp::distributed_vector; -const std::size_t redundancy = 2; -const std::size_t size = 6; +static const std::size_t redundancy = 2; +static const std::size_t size = 6; -dr::mp::distribution get_distribution() { +static dr::mp::distribution get_distribution() { return dr::mp::distribution().halo(1).redundancy(redundancy); } -int &get(Array &v, std::size_t i) { return *(v.begin() + i).local(); } +static int &get(Array &v, std::size_t i) { return *(v.begin() + i).local(); } -TEST(WideHalo3, suite_works_for_3_processes_only) { +TEST(WideHalo3_1D, suite_works_for_3_processes_only) { EXPECT_EQ(dr::mp::default_comm().size(), 3); } -TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { +TEST(WideHalo3_1D, halo_is_visible_after_exchange_not_earlier) { dr::mp::distribution dist = get_distribution(); Array dv(size, dist); Array dv_out(size, dist); @@ -141,7 +141,7 @@ TEST(WideHalo3, halo_is_visible_after_exchange_not_earlier) { } } -TEST(WideHalo3, halo_api_works) { +TEST(WideHalo3_1D, halo_api_works) { dr::mp::distribution dist = get_distribution(); Array dv(size, dist); Array dv_out(size, dist); diff --git a/test/gtest/mp/wide-halo-2d-3.cpp b/test/gtest/mp/wide-halo-2d-3.cpp index c0e7acf716..08ce9fcc96 100644 --- a/test/gtest/mp/wide-halo-2d-3.cpp +++ b/test/gtest/mp/wide-halo-2d-3.cpp @@ -4,27 +4,27 @@ #include "xp-tests.hpp" -template class WideHalo3 : public testing::Test {}; +template class WideHalo3_2D : public testing::Test {}; using T = int; using Array = dr::mp::distributed_mdarray; -const std::size_t redundancy = 2; -const std::array size = {6, 6}; +static const std::size_t redundancy = 2; +static const std::array size = {6, 6}; -dr::mp::distribution get_distribution() { +static dr::mp::distribution get_distribution() { return dr::mp::distribution().halo(1).redundancy(redundancy); } -int &get(Array &v, std::size_t i, std::size_t j) { +static int &get(Array &v, std::size_t i, std::size_t j) { return *(v.begin() + i * size[0] + j).local(); } -TEST(WideHalo3, suite_works_for_3_processes_only) { +TEST(WideHalo3_2D, suite_works_for_3_processes_only) { EXPECT_EQ(dr::mp::default_comm().size(), 3); } -TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { +TEST(WideHalo3_2D, halo2d_is_visible_after_exchange_not_earlier) { dr::mp::distribution dist = get_distribution(); Array dv(size, dist); Array dv_out(size, dist); @@ -178,7 +178,7 @@ TEST(WideHalo3, halo2d_is_visible_after_exchange_not_earlier) { } } -TEST(WideHalo3, halo2d_api_works) { +TEST(WideHalo3_2D, halo2d_api_works) { dr::mp::distribution dist = get_distribution(); Array dv(size, dist); Array dv_out(size, dist); From b330fe7b88505c6643de4df30b5ddc375c4562f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Sun, 24 Nov 2024 15:36:52 +0100 Subject: [PATCH 11/19] Clang format fixes --- include/dr/mp/alignment.hpp | 4 +++- test/gtest/mp/CMakeLists.txt | 41 +++++++++++++++++++----------------- test/gtest/mp/copy.cpp | 4 +--- test/gtest/mp/reduce.cpp | 4 +--- test/gtest/mp/segments.cpp | 8 ++----- test/gtest/mp/xp-tests.hpp | 18 ++++++++-------- 6 files changed, 38 insertions(+), 41 deletions(-) diff --git a/include/dr/mp/alignment.hpp b/include/dr/mp/alignment.hpp index 71ce55dcc9..790e9620a4 100644 --- a/include/dr/mp/alignment.hpp +++ b/include/dr/mp/alignment.hpp @@ -11,7 +11,9 @@ namespace dr::mp { template -concept has_segments = requires(T &t) { dr::ranges::segments(t); }; +concept has_segments = requires(T &t) { + dr::ranges::segments(t); +}; template concept no_segments = !has_segments; diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index e845b128d1..28b4eef43f 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -1,11 +1,11 @@ -# SPDX-FileCopyrightText: Intel Corporation +#SPDX - FileCopyrightText : Intel Corporation # -# SPDX-License-Identifier: BSD-3-Clause +#SPDX - License - Identifier : BSD - 3 - Clause set(CMAKE_INCLUDE_CURRENT_DIR ON) -# tested with a variable number of ranks -# cmake-format: off +#tested with a variable number of ranks +#cmake - format : off add_executable( mp-tests mp-tests.cpp @@ -17,10 +17,10 @@ add_executable( ../common/drop.cpp ../common/enumerate.cpp ../common/equal.cpp -# ../common/exclusive_scan.cpp disabled due to deadlock - DRA-213 +#../ common / exclusive_scan.cpp disabled due to deadlock - DRA - 213 ../common/fill.cpp ../common/for_each.cpp -# ../common/inclusive_scan.cpp disabled due to deadlock - DRA-213 +#../ common / inclusive_scan.cpp disabled due to deadlock - DRA - 213 ../common/iota.cpp ../common/iota_view.cpp ../common/reduce.cpp @@ -55,9 +55,11 @@ add_executable( wide-halo-2d-3.cpp ) -# mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it -# builds much faster. Change the source files to match what you need to test. It -# is OK to commit changes to the source file list. +#mp - quick - test and mp - quick - test - 3 - \ + only is for development.By reducing the number of source files, \ + it +#builds much faster.Change the source files to match what you need to test.It +#is OK to commit changes to the source file list. add_executable(mp-quick-test mp-tests.cpp halo.cpp @@ -66,7 +68,7 @@ add_executable(mp-quick-test-3-only mp-tests.cpp wide-halo-2d-3.cpp ) -# cmake-format: on +#cmake - format : on target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST) target_compile_definitions(mp-quick-test-3-only PRIVATE QUICK_TEST) @@ -80,8 +82,8 @@ foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-onl "${CMAKE_COMMAND} -E time") endforeach() -# tests without --sycl flag will fail on IshmemBackend TODO: make them be -# running somehow if ENABLE_ISHMEM will be default CI config +#tests without-- sycl flag will fail on IshmemBackend TODO : make them be +#running somehow if ENABLE_ISHMEM will be default CI config if(NOT ENABLE_ISHMEM) add_mp_ctest(NAME mp-quick-test NPROC 1) add_mp_ctest(NAME mp-quick-test NPROC 2) @@ -90,7 +92,7 @@ if(NOT ENABLE_ISHMEM) cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) if(NOT MPI_IMPL STREQUAL "openmpi") - # MPI_Win_create fails for communicator with size 1 +#MPI_Win_create fails for communicator with size 1 add_mp_ctest(NAME mp-tests NPROC 1 TIMEOUT 150) endif() foreach(nproc RANGE 2 4) @@ -100,15 +102,16 @@ if(NOT ENABLE_ISHMEM) endif() if(ENABLE_SYCL) - # DRA-83: Slide isn't complete +#DRA - 83 : Slide isn't complete set(sycl-exclusions *Slide*:ComplexSlide*:) if(ENABLE_ISHMEM) - # Some Halo3 cases don't work on IshmemBackend (work on MPI, see: DRA-84), - # Counted.large fails with - # distributed-ranges/test/gtest/common/counted.cpp:62: Failure Expected - # equality of these values: *(--counted_result.end()) Which is: 5, should be - # 77 Mdspan, Mdarray hangs sometimes on ISHMEM. +#Some Halo3 cases don't work on IshmemBackend (work on MPI, see: DRA-84), +#Counted.large fails with +#distributed - \ + ranges / test / gtest / common / counted.cpp : 62 : Failure Expected +#equality of these values : *(--counted_result.end())Which is : 5, should be +# 77 Mdspan, Mdarray hangs sometimes on ISHMEM. set(sycl-exclusions ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) endif() diff --git a/test/gtest/mp/copy.cpp b/test/gtest/mp/copy.cpp index 66e54def9c..a54d194890 100644 --- a/test/gtest/mp/copy.cpp +++ b/test/gtest/mp/copy.cpp @@ -6,9 +6,7 @@ // Fixture -template class CopyMP : public testing::Test { -public: -}; +template class CopyMP : public testing::Test { public: }; TYPED_TEST_SUITE(CopyMP, AllTypes); diff --git a/test/gtest/mp/reduce.cpp b/test/gtest/mp/reduce.cpp index e663188bbd..4fb683138b 100644 --- a/test/gtest/mp/reduce.cpp +++ b/test/gtest/mp/reduce.cpp @@ -6,9 +6,7 @@ // Fixture -template class ReduceMP : public testing::Test { -public: -}; +template class ReduceMP : public testing::Test { public: }; TYPED_TEST_SUITE(ReduceMP, AllTypes); diff --git a/test/gtest/mp/segments.cpp b/test/gtest/mp/segments.cpp index 029a5faaa3..8817bed379 100644 --- a/test/gtest/mp/segments.cpp +++ b/test/gtest/mp/segments.cpp @@ -6,9 +6,7 @@ #include -template class Segmented : public testing::Test { -public: -}; +template class Segmented : public testing::Test { public: }; TYPED_TEST_SUITE(Segmented, AllTypesWithoutIshmem); @@ -27,9 +25,7 @@ TYPED_TEST(Segmented, Basic) { EXPECT_EQ(dr::ranges::segments(ops.dist_vec), segmented); } -template class SegmentUtils : public testing::Test { -public: -}; +template class SegmentUtils : public testing::Test { public: }; // traversing on host over local_segment does not work in case of both: // device_memory and IshmemBackend (which uses device memory) diff --git a/test/gtest/mp/xp-tests.hpp b/test/gtest/mp/xp-tests.hpp index d4c14fc0e5..4705b53baf 100644 --- a/test/gtest/mp/xp-tests.hpp +++ b/test/gtest/mp/xp-tests.hpp @@ -20,15 +20,15 @@ namespace xp = dr::mp; template concept compliant_view = rng::forward_range && rng::random_access_range && - rng::viewable_range && requires(V &v) { - // test one at a time so error is apparent - dr::ranges::segments(v); - dr::ranges::segments(v).begin(); - *dr::ranges::segments(v).begin(); - dr::ranges::rank(*dr::ranges::segments(v).begin()); - // dr::ranges::local(rng::begin(dr::ranges::segments(v)[0])); - // dr::mp::local_segments(v); - }; + rng::viewable_range && requires(V &v) { + // test one at a time so error is apparent + dr::ranges::segments(v); + dr::ranges::segments(v).begin(); + *dr::ranges::segments(v).begin(); + dr::ranges::rank(*dr::ranges::segments(v).begin()); + // dr::ranges::local(rng::begin(dr::ranges::segments(v)[0])); + // dr::mp::local_segments(v); +}; inline void barrier() { dr::mp::barrier(); } inline void fence() { dr::mp::fence(); } From 716d4e72f99315493558ceace35b5a822baac27d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ch=C4=99tkowski?= Date: Mon, 25 Nov 2024 21:33:19 +0100 Subject: [PATCH 12/19] Revert clang format "fix" for CMakeLists.txt --- test/gtest/mp/CMakeLists.txt | 41 +++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 28b4eef43f..e845b128d1 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -1,11 +1,11 @@ -#SPDX - FileCopyrightText : Intel Corporation +# SPDX-FileCopyrightText: Intel Corporation # -#SPDX - License - Identifier : BSD - 3 - Clause +# SPDX-License-Identifier: BSD-3-Clause set(CMAKE_INCLUDE_CURRENT_DIR ON) -#tested with a variable number of ranks -#cmake - format : off +# tested with a variable number of ranks +# cmake-format: off add_executable( mp-tests mp-tests.cpp @@ -17,10 +17,10 @@ add_executable( ../common/drop.cpp ../common/enumerate.cpp ../common/equal.cpp -#../ common / exclusive_scan.cpp disabled due to deadlock - DRA - 213 +# ../common/exclusive_scan.cpp disabled due to deadlock - DRA-213 ../common/fill.cpp ../common/for_each.cpp -#../ common / inclusive_scan.cpp disabled due to deadlock - DRA - 213 +# ../common/inclusive_scan.cpp disabled due to deadlock - DRA-213 ../common/iota.cpp ../common/iota_view.cpp ../common/reduce.cpp @@ -55,11 +55,9 @@ add_executable( wide-halo-2d-3.cpp ) -#mp - quick - test and mp - quick - test - 3 - \ - only is for development.By reducing the number of source files, \ - it -#builds much faster.Change the source files to match what you need to test.It -#is OK to commit changes to the source file list. +# mp-quick-test and mp-quick-test-3-only is for development. By reducing the number of source files, it +# builds much faster. Change the source files to match what you need to test. It +# is OK to commit changes to the source file list. add_executable(mp-quick-test mp-tests.cpp halo.cpp @@ -68,7 +66,7 @@ add_executable(mp-quick-test-3-only mp-tests.cpp wide-halo-2d-3.cpp ) -#cmake - format : on +# cmake-format: on target_compile_definitions(mp-quick-test PRIVATE QUICK_TEST) target_compile_definitions(mp-quick-test-3-only PRIVATE QUICK_TEST) @@ -82,8 +80,8 @@ foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-onl "${CMAKE_COMMAND} -E time") endforeach() -#tests without-- sycl flag will fail on IshmemBackend TODO : make them be -#running somehow if ENABLE_ISHMEM will be default CI config +# tests without --sycl flag will fail on IshmemBackend TODO: make them be +# running somehow if ENABLE_ISHMEM will be default CI config if(NOT ENABLE_ISHMEM) add_mp_ctest(NAME mp-quick-test NPROC 1) add_mp_ctest(NAME mp-quick-test NPROC 2) @@ -92,7 +90,7 @@ if(NOT ENABLE_ISHMEM) cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) if(NOT MPI_IMPL STREQUAL "openmpi") -#MPI_Win_create fails for communicator with size 1 + # MPI_Win_create fails for communicator with size 1 add_mp_ctest(NAME mp-tests NPROC 1 TIMEOUT 150) endif() foreach(nproc RANGE 2 4) @@ -102,16 +100,15 @@ if(NOT ENABLE_ISHMEM) endif() if(ENABLE_SYCL) -#DRA - 83 : Slide isn't complete + # DRA-83: Slide isn't complete set(sycl-exclusions *Slide*:ComplexSlide*:) if(ENABLE_ISHMEM) -#Some Halo3 cases don't work on IshmemBackend (work on MPI, see: DRA-84), -#Counted.large fails with -#distributed - \ - ranges / test / gtest / common / counted.cpp : 62 : Failure Expected -#equality of these values : *(--counted_result.end())Which is : 5, should be -# 77 Mdspan, Mdarray hangs sometimes on ISHMEM. + # Some Halo3 cases don't work on IshmemBackend (work on MPI, see: DRA-84), + # Counted.large fails with + # distributed-ranges/test/gtest/common/counted.cpp:62: Failure Expected + # equality of these values: *(--counted_result.end()) Which is: 5, should be + # 77 Mdspan, Mdarray hangs sometimes on ISHMEM. set(sycl-exclusions ${sycl-exclusions}Halo3/*:Sort*:Counted/*:Mdspan*:Mdarray*:) endif() From b89c1315c7a0c1e7dfb5057af9a63b8bc2def87a Mon Sep 17 00:00:00 2001 From: kc432959 Date: Sun, 8 Dec 2024 18:25:06 +0100 Subject: [PATCH 13/19] changes for my tests --- benchmarks/gbench/mp/CMakeLists.txt | 266 ++++++++++++++-------------- test/gtest/mp/CMakeLists.txt | 2 +- test/gtest/mp/halo-3.cpp | 8 + 3 files changed, 142 insertions(+), 134 deletions(-) diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index 0ad4fb096c..9fa6b1e587 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -1,133 +1,133 @@ -# SPDX-FileCopyrightText: Intel Corporation -# -# SPDX-License-Identifier: BSD-3-Clause - -set(CMAKE_INCLUDE_CURRENT_DIR ON) - -# cmake-format: off -add_executable( - mp-bench - mp-bench.cpp - ../common/distributed_vector.cpp - ../common/dot_product.cpp - ../common/inclusive_exclusive_scan.cpp - ../common/sort.cpp - ../common/stream.cpp - streammp.cpp - rooted.cpp - stencil_1d.cpp - stencil_2d.cpp - chunk.cpp - # mdspan.cpp - mpi.cpp) -# cmake-format: on - -# disabled with SYCL due to DRA-135 -if(NOT ENABLE_SYCL) - target_sources(mp-bench PRIVATE mdspan.cpp) -endif() - -if(ENABLE_SYCL) - target_sources(mp-bench PRIVATE fft3d.cpp) -endif() - -if(NOT ENABLE_CUDA) - # does not compile in CUDA because: black_scholes.cpp uses std::log - # shallow_water, wave_equation uses uses exp - target_sources(mp-bench PRIVATE ../common/black_scholes.cpp shallow_water.cpp - wave_equation.cpp) -endif() - -# mp-quick-bench is for development. By reducing the number of source files, it -# builds much faster. Change the source files to match what you need to test. It -# is OK to commit changes to the source file list. -add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) - -foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) - target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) - target_link_libraries(${mp-bench-exec} benchmark::benchmark cxxopts DR::mpi) - if(ENABLE_ISHMEM) - target_link_ishmem(${mp-bench-exec}) - endif() - if(ENABLE_SYCL) - target_link_libraries(${mp-bench-exec} MKL::MKL_DPCPP) - endif() -endforeach() - -if(ENABLE_SYCL) - # target_sources(mp-quick-bench PRIVATE fft3d.cpp) -endif() - -cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) - -# debug mp-bench is too slow -if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT MPI_IMPL STREQUAL "openmpi") - # MPI_Win_create fails for communicator with size 1 30000 is minimum because - # of static column size for stencil2D disable DPL benchmarks because we get - # intermittent fails with: ONEAPI_DEVICE_SELECTOR=opencl:cpu mpirun -n 1 - # ./mp-bench --vector-size 30000 --rows 100 --columns 100 --check - add_mp_ctest( - NAME mp-bench TIMEOUT 200 TARGS --vector-size 30000 --rows 100 --columns - 100 --check --benchmark_filter=-FFT3D.*) - if(ENABLE_SYCL) - add_mp_ctest( - NAME mp-bench TIMEOUT 200 SYCL TARGS --vector-size 30000 --rows 100 - --columns 100 --check --benchmark_filter=-.*DPL.*) - endif() -endif() - -add_executable(wave_equation wave_equation.cpp) -target_link_libraries(wave_equation cxxopts DR::mpi) -target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) -add_mp_ctest(NAME wave_equation) -add_executable(wave_equation_wide wave_equation_wide.cpp) -target_link_libraries(wave_equation_wide cxxopts DR::mpi) -target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) -add_mp_ctest(NAME wave_equation_wide) -# add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # -# DRA-92 -if(ENABLE_SYCL) - add_mp_ctest( - TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) - add_mp_ctest( - TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) - add_mp_ctest( - TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) - add_mp_ctest( - TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) - foreach(redundancy RANGE 1 8) - add_mp_ctest( - TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) - endforeach() - add_mp_ctest( - TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) - foreach(redundancy RANGE 1 8) - add_mp_ctest( - TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) - endforeach() -endif() - -add_executable(shallow_water shallow_water.cpp) -target_link_libraries(shallow_water cxxopts DR::mpi) -target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) - -# issue DRA-23 add_mp_ctest(TEST_NAME shallow_water NAME shallow_water NPROC 1) -# add_mp_ctest( TEST_NAME shallow_water_fused NAME shallow_water NPROC 1 TARGS -# -f) - -if(ENABLE_SYCL) - if(CMAKE_BUILD_TYPE STREQUAL "Release") - # too long (or hangs?) in debug - - add_mp_ctest( - TEST_NAME shallow_water-sycl NAME shallow_water NPROC 2 SYCL) - add_mp_ctest( - TEST_NAME shallow_water_fused-sycl NAME shallow_water NPROC 2 SYCL TARGS - -f) - endif() - - add_executable(fft3d_mp fft3d.cpp) - target_link_libraries(fft3d_mp cxxopts DR::mpi MKL::MKL_DPCPP) - target_compile_definitions(fft3d_mp PRIVATE STANDALONE_BENCHMARK) - add_mp_ctest(TEST_NAME fft3d_mp NAME fft3d_mp NPROC 2) -endif() +## SPDX-FileCopyrightText: Intel Corporation +## +## SPDX-License-Identifier: BSD-3-Clause +# +#set(CMAKE_INCLUDE_CURRENT_DIR ON) +# +## cmake-format: off +#add_executable( +# mp-bench +# mp-bench.cpp +# ../common/distributed_vector.cpp +# ../common/dot_product.cpp +# ../common/inclusive_exclusive_scan.cpp +# ../common/sort.cpp +# ../common/stream.cpp +# streammp.cpp +# rooted.cpp +# stencil_1d.cpp +# stencil_2d.cpp +# chunk.cpp +# # mdspan.cpp +# mpi.cpp) +## cmake-format: on +# +## disabled with SYCL due to DRA-135 +#if(NOT ENABLE_SYCL) +# target_sources(mp-bench PRIVATE mdspan.cpp) +#endif() +# +#if(ENABLE_SYCL) +# target_sources(mp-bench PRIVATE fft3d.cpp) +#endif() +# +#if(NOT ENABLE_CUDA) +# # does not compile in CUDA because: black_scholes.cpp uses std::log +# # shallow_water, wave_equation uses uses exp +# target_sources(mp-bench PRIVATE ../common/black_scholes.cpp shallow_water.cpp +# wave_equation.cpp) +#endif() +# +## mp-quick-bench is for development. By reducing the number of source files, it +## builds much faster. Change the source files to match what you need to test. It +## is OK to commit changes to the source file list. +#add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) +# +#foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) +# target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) +# target_link_libraries(${mp-bench-exec} benchmark::benchmark cxxopts DR::mpi) +# if(ENABLE_ISHMEM) +# target_link_ishmem(${mp-bench-exec}) +# endif() +# if(ENABLE_SYCL) +# target_link_libraries(${mp-bench-exec} MKL::MKL_DPCPP) +# endif() +#endforeach() +# +#if(ENABLE_SYCL) +# # target_sources(mp-quick-bench PRIVATE fft3d.cpp) +#endif() +# +#cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) +# +## debug mp-bench is too slow +#if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT MPI_IMPL STREQUAL "openmpi") +# # MPI_Win_create fails for communicator with size 1 30000 is minimum because +# # of static column size for stencil2D disable DPL benchmarks because we get +# # intermittent fails with: ONEAPI_DEVICE_SELECTOR=opencl:cpu mpirun -n 1 +# # ./mp-bench --vector-size 30000 --rows 100 --columns 100 --check +# add_mp_ctest( +# NAME mp-bench TIMEOUT 200 TARGS --vector-size 30000 --rows 100 --columns +# 100 --check --benchmark_filter=-FFT3D.*) +# if(ENABLE_SYCL) +# add_mp_ctest( +# NAME mp-bench TIMEOUT 200 SYCL TARGS --vector-size 30000 --rows 100 +# --columns 100 --check --benchmark_filter=-.*DPL.*) +# endif() +#endif() +# +#add_executable(wave_equation wave_equation.cpp) +#target_link_libraries(wave_equation cxxopts DR::mpi) +#target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) +#add_mp_ctest(NAME wave_equation) +#add_executable(wave_equation_wide wave_equation_wide.cpp) +#target_link_libraries(wave_equation_wide cxxopts DR::mpi) +#target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) +#add_mp_ctest(NAME wave_equation_wide) +## add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # +## DRA-92 +#if(ENABLE_SYCL) +# add_mp_ctest( +# TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) +# add_mp_ctest( +# TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) +# add_mp_ctest( +# TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) +# add_mp_ctest( +# TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) +# foreach(redundancy RANGE 1 8) +# add_mp_ctest( +# TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) +# endforeach() +# add_mp_ctest( +# TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) +# foreach(redundancy RANGE 1 8) +# add_mp_ctest( +# TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) +# endforeach() +#endif() +# +#add_executable(shallow_water shallow_water.cpp) +#target_link_libraries(shallow_water cxxopts DR::mpi) +#target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) +# +## issue DRA-23 add_mp_ctest(TEST_NAME shallow_water NAME shallow_water NPROC 1) +## add_mp_ctest( TEST_NAME shallow_water_fused NAME shallow_water NPROC 1 TARGS +## -f) +# +#if(ENABLE_SYCL) +# if(CMAKE_BUILD_TYPE STREQUAL "Release") +# # too long (or hangs?) in debug +# +# add_mp_ctest( +# TEST_NAME shallow_water-sycl NAME shallow_water NPROC 2 SYCL) +# add_mp_ctest( +# TEST_NAME shallow_water_fused-sycl NAME shallow_water NPROC 2 SYCL TARGS +# -f) +# endif() +# +# add_executable(fft3d_mp fft3d.cpp) +# target_link_libraries(fft3d_mp cxxopts DR::mpi MKL::MKL_DPCPP) +# target_compile_definitions(fft3d_mp PRIVATE STANDALONE_BENCHMARK) +# add_mp_ctest(TEST_NAME fft3d_mp NAME fft3d_mp NPROC 2) +#endif() diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index e845b128d1..f62f965c59 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -60,7 +60,7 @@ add_executable( # is OK to commit changes to the source file list. add_executable(mp-quick-test mp-tests.cpp - halo.cpp + halo-3.cpp ) add_executable(mp-quick-test-3-only mp-tests.cpp diff --git a/test/gtest/mp/halo-3.cpp b/test/gtest/mp/halo-3.cpp index b2131d430a..aa4127f397 100644 --- a/test/gtest/mp/halo-3.cpp +++ b/test/gtest/mp/halo-3.cpp @@ -18,19 +18,27 @@ TYPED_TEST(Halo3, halo_is_visible_after_exchange_not_earlier) { dv.halo().exchange(); fill(dv, 13); + std::cout << "switch to check values: \n"; switch (dr::mp::default_comm().rank()) { case 0: + std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 0).local(), 13); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 1).local(), 7); break; case 1: + std::cout << "dv[0] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 0).local(), 7); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 1).local(), 13); + std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 2).local(), 7); break; case 2: EXPECT_EQ(*(dv.begin() + 1).local(), 7); + std::cout << "dv[1] = " << *(dv.begin() + 0).local() << "\n"; EXPECT_EQ(*(dv.begin() + 2).local(), 13); + std::cout << "dv[2] = " << *(dv.begin() + 0).local() << "\n"; break; } From a007da2a7a2a642670d3db3f03d4d1960e24077e Mon Sep 17 00:00:00 2001 From: kc432959 Date: Sun, 8 Dec 2024 21:23:19 +0100 Subject: [PATCH 14/19] More logs and bring back benchmark cmake --- benchmarks/gbench/mp/CMakeLists.txt | 266 ++++++++++---------- benchmarks/gbench/mp/wave_equation_wide.cpp | 10 + include/dr/mp/algorithms/for_each.hpp | 6 + 3 files changed, 149 insertions(+), 133 deletions(-) diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index 9fa6b1e587..0ad4fb096c 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -1,133 +1,133 @@ -## SPDX-FileCopyrightText: Intel Corporation -## -## SPDX-License-Identifier: BSD-3-Clause -# -#set(CMAKE_INCLUDE_CURRENT_DIR ON) -# -## cmake-format: off -#add_executable( -# mp-bench -# mp-bench.cpp -# ../common/distributed_vector.cpp -# ../common/dot_product.cpp -# ../common/inclusive_exclusive_scan.cpp -# ../common/sort.cpp -# ../common/stream.cpp -# streammp.cpp -# rooted.cpp -# stencil_1d.cpp -# stencil_2d.cpp -# chunk.cpp -# # mdspan.cpp -# mpi.cpp) -## cmake-format: on -# -## disabled with SYCL due to DRA-135 -#if(NOT ENABLE_SYCL) -# target_sources(mp-bench PRIVATE mdspan.cpp) -#endif() -# -#if(ENABLE_SYCL) -# target_sources(mp-bench PRIVATE fft3d.cpp) -#endif() -# -#if(NOT ENABLE_CUDA) -# # does not compile in CUDA because: black_scholes.cpp uses std::log -# # shallow_water, wave_equation uses uses exp -# target_sources(mp-bench PRIVATE ../common/black_scholes.cpp shallow_water.cpp -# wave_equation.cpp) -#endif() -# -## mp-quick-bench is for development. By reducing the number of source files, it -## builds much faster. Change the source files to match what you need to test. It -## is OK to commit changes to the source file list. -#add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) -# -#foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) -# target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) -# target_link_libraries(${mp-bench-exec} benchmark::benchmark cxxopts DR::mpi) -# if(ENABLE_ISHMEM) -# target_link_ishmem(${mp-bench-exec}) -# endif() -# if(ENABLE_SYCL) -# target_link_libraries(${mp-bench-exec} MKL::MKL_DPCPP) -# endif() -#endforeach() -# -#if(ENABLE_SYCL) -# # target_sources(mp-quick-bench PRIVATE fft3d.cpp) -#endif() -# -#cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) -# -## debug mp-bench is too slow -#if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT MPI_IMPL STREQUAL "openmpi") -# # MPI_Win_create fails for communicator with size 1 30000 is minimum because -# # of static column size for stencil2D disable DPL benchmarks because we get -# # intermittent fails with: ONEAPI_DEVICE_SELECTOR=opencl:cpu mpirun -n 1 -# # ./mp-bench --vector-size 30000 --rows 100 --columns 100 --check -# add_mp_ctest( -# NAME mp-bench TIMEOUT 200 TARGS --vector-size 30000 --rows 100 --columns -# 100 --check --benchmark_filter=-FFT3D.*) -# if(ENABLE_SYCL) -# add_mp_ctest( -# NAME mp-bench TIMEOUT 200 SYCL TARGS --vector-size 30000 --rows 100 -# --columns 100 --check --benchmark_filter=-.*DPL.*) -# endif() -#endif() -# -#add_executable(wave_equation wave_equation.cpp) -#target_link_libraries(wave_equation cxxopts DR::mpi) -#target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) -#add_mp_ctest(NAME wave_equation) -#add_executable(wave_equation_wide wave_equation_wide.cpp) -#target_link_libraries(wave_equation_wide cxxopts DR::mpi) -#target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) -#add_mp_ctest(NAME wave_equation_wide) -## add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # -## DRA-92 -#if(ENABLE_SYCL) -# add_mp_ctest( -# TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) -# add_mp_ctest( -# TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) -# add_mp_ctest( -# TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) -# add_mp_ctest( -# TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) -# foreach(redundancy RANGE 1 8) -# add_mp_ctest( -# TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) -# endforeach() -# add_mp_ctest( -# TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) -# foreach(redundancy RANGE 1 8) -# add_mp_ctest( -# TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) -# endforeach() -#endif() -# -#add_executable(shallow_water shallow_water.cpp) -#target_link_libraries(shallow_water cxxopts DR::mpi) -#target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) -# -## issue DRA-23 add_mp_ctest(TEST_NAME shallow_water NAME shallow_water NPROC 1) -## add_mp_ctest( TEST_NAME shallow_water_fused NAME shallow_water NPROC 1 TARGS -## -f) -# -#if(ENABLE_SYCL) -# if(CMAKE_BUILD_TYPE STREQUAL "Release") -# # too long (or hangs?) in debug -# -# add_mp_ctest( -# TEST_NAME shallow_water-sycl NAME shallow_water NPROC 2 SYCL) -# add_mp_ctest( -# TEST_NAME shallow_water_fused-sycl NAME shallow_water NPROC 2 SYCL TARGS -# -f) -# endif() -# -# add_executable(fft3d_mp fft3d.cpp) -# target_link_libraries(fft3d_mp cxxopts DR::mpi MKL::MKL_DPCPP) -# target_compile_definitions(fft3d_mp PRIVATE STANDALONE_BENCHMARK) -# add_mp_ctest(TEST_NAME fft3d_mp NAME fft3d_mp NPROC 2) -#endif() +# SPDX-FileCopyrightText: Intel Corporation +# +# SPDX-License-Identifier: BSD-3-Clause + +set(CMAKE_INCLUDE_CURRENT_DIR ON) + +# cmake-format: off +add_executable( + mp-bench + mp-bench.cpp + ../common/distributed_vector.cpp + ../common/dot_product.cpp + ../common/inclusive_exclusive_scan.cpp + ../common/sort.cpp + ../common/stream.cpp + streammp.cpp + rooted.cpp + stencil_1d.cpp + stencil_2d.cpp + chunk.cpp + # mdspan.cpp + mpi.cpp) +# cmake-format: on + +# disabled with SYCL due to DRA-135 +if(NOT ENABLE_SYCL) + target_sources(mp-bench PRIVATE mdspan.cpp) +endif() + +if(ENABLE_SYCL) + target_sources(mp-bench PRIVATE fft3d.cpp) +endif() + +if(NOT ENABLE_CUDA) + # does not compile in CUDA because: black_scholes.cpp uses std::log + # shallow_water, wave_equation uses uses exp + target_sources(mp-bench PRIVATE ../common/black_scholes.cpp shallow_water.cpp + wave_equation.cpp) +endif() + +# mp-quick-bench is for development. By reducing the number of source files, it +# builds much faster. Change the source files to match what you need to test. It +# is OK to commit changes to the source file list. +add_executable(mp-quick-bench mp-bench.cpp ../common/distributed_vector.cpp) + +foreach(mp-bench-exec IN ITEMS mp-bench mp-quick-bench) + target_compile_definitions(${mp-bench-exec} PRIVATE BENCH_MP) + target_link_libraries(${mp-bench-exec} benchmark::benchmark cxxopts DR::mpi) + if(ENABLE_ISHMEM) + target_link_ishmem(${mp-bench-exec}) + endif() + if(ENABLE_SYCL) + target_link_libraries(${mp-bench-exec} MKL::MKL_DPCPP) + endif() +endforeach() + +if(ENABLE_SYCL) + # target_sources(mp-quick-bench PRIVATE fft3d.cpp) +endif() + +cmake_path(GET MPI_CXX_ADDITIONAL_INCLUDE_DIRS FILENAME MPI_IMPL) + +# debug mp-bench is too slow +if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT MPI_IMPL STREQUAL "openmpi") + # MPI_Win_create fails for communicator with size 1 30000 is minimum because + # of static column size for stencil2D disable DPL benchmarks because we get + # intermittent fails with: ONEAPI_DEVICE_SELECTOR=opencl:cpu mpirun -n 1 + # ./mp-bench --vector-size 30000 --rows 100 --columns 100 --check + add_mp_ctest( + NAME mp-bench TIMEOUT 200 TARGS --vector-size 30000 --rows 100 --columns + 100 --check --benchmark_filter=-FFT3D.*) + if(ENABLE_SYCL) + add_mp_ctest( + NAME mp-bench TIMEOUT 200 SYCL TARGS --vector-size 30000 --rows 100 + --columns 100 --check --benchmark_filter=-.*DPL.*) + endif() +endif() + +add_executable(wave_equation wave_equation.cpp) +target_link_libraries(wave_equation cxxopts DR::mpi) +target_compile_definitions(wave_equation PRIVATE STANDALONE_BENCHMARK) +add_mp_ctest(NAME wave_equation) +add_executable(wave_equation_wide wave_equation_wide.cpp) +target_link_libraries(wave_equation_wide cxxopts DR::mpi) +target_compile_definitions(wave_equation_wide PRIVATE STANDALONE_BENCHMARK) +add_mp_ctest(NAME wave_equation_wide) +# add_mp_ctest(TEST_NAME wave_equation_fused NAME wave_equation TARGS -f) # +# DRA-92 +if(ENABLE_SYCL) + add_mp_ctest( + TEST_NAME wave_equation-sycl NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL) + add_mp_ctest( + TEST_NAME wave_equation-sycl-benchmark NAME wave_equation TIMEOUT 1000 NPROC 8 SYCL TARGS -t) + add_mp_ctest( + TEST_NAME wave_equation_fused-sycl NAME wave_equation TIMEOUT 1000 NPROC 2 SYCL TARGS -f) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS -t 100 -r ${redundancy}) + endforeach() + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-gpu NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory) + foreach(redundancy RANGE 1 8) + add_mp_ctest( + TEST_NAME wave_equation_wide-sycl-gpu-benchmark-${redundancy} NAME wave_equation_wide TIMEOUT 1000 NPROC 8 SYCL TARGS --device-memory -t 100 -r ${redundancy}) + endforeach() +endif() + +add_executable(shallow_water shallow_water.cpp) +target_link_libraries(shallow_water cxxopts DR::mpi) +target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) + +# issue DRA-23 add_mp_ctest(TEST_NAME shallow_water NAME shallow_water NPROC 1) +# add_mp_ctest( TEST_NAME shallow_water_fused NAME shallow_water NPROC 1 TARGS +# -f) + +if(ENABLE_SYCL) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + # too long (or hangs?) in debug + + add_mp_ctest( + TEST_NAME shallow_water-sycl NAME shallow_water NPROC 2 SYCL) + add_mp_ctest( + TEST_NAME shallow_water_fused-sycl NAME shallow_water NPROC 2 SYCL TARGS + -f) + endif() + + add_executable(fft3d_mp fft3d.cpp) + target_link_libraries(fft3d_mp cxxopts DR::mpi MKL::MKL_DPCPP) + target_compile_definitions(fft3d_mp PRIVATE STANDALONE_BENCHMARK) + add_mp_ctest(TEST_NAME fft3d_mp NAME fft3d_mp NPROC 2) +endif() diff --git a/benchmarks/gbench/mp/wave_equation_wide.cpp b/benchmarks/gbench/mp/wave_equation_wide.cpp index b8fb28e083..51830d9deb 100644 --- a/benchmarks/gbench/mp/wave_equation_wide.cpp +++ b/benchmarks/gbench/mp/wave_equation_wide.cpp @@ -140,10 +140,13 @@ int run( std::cout << "Redundancy " << redundancy << std::endl; } + std::cout << "before e\n"; // state variables // water elevation at T points Array e({nx + 1, ny}, dist); + std::cout << "after e\n"; dr::mp::fill(e, 0.0); + std::cout << "after fill e\n"; // x velocity at U points Array u({nx + 1, ny}, dist); dr::mp::fill(u, 0.0); @@ -165,12 +168,17 @@ int run( Array dudt({nx + 1, ny}, dist); Array dvdt({nx + 1, ny + 1}, dist); + std::cout << "After all arrays\n"; + dr::mp::fill(dedt, 0); dr::mp::fill(dudt, 0); dr::mp::fill(dvdt, 0); + std::cout << "After fill\n"; + dr::mp::halo(dedt).exchange(); dr::mp::halo(dudt).exchange(); dr::mp::halo(dvdt).exchange(); + std::cout << "After first exchange\n"; auto init_op = [xmin, ymin, grid](auto index, auto v) { auto &[o] = v; @@ -184,6 +192,7 @@ int run( } }; dr::mp::for_each(init_op, e); + std::cout << "After mp::for_each\n"; auto add = [](auto ops) { return ops.first + ops.second; }; auto max = [](double x, double y) { return std::max(x, y); }; @@ -225,6 +234,7 @@ int run( }; for (std::size_t i = 0; i < nt + 1; i++) { + std::cout << "i = " << i << "\n"; t = static_cast(i) * dt; if (t >= next_t_export - 1e-8) { diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 2b841be43b..13a657771c 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -97,13 +97,16 @@ void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, }; if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION + std::cout << "do parallel_for 1d - sycl\n"; dr::__detail::parallel_for(dr::mp::sycl_queue(), sycl::range<1>(distance[0]), do_point) .wait(); #else + std::cout << "do parallel_for 1d - sycl failed\n"; assert(false); #endif } else { + std::cout << "do parallel_for 1d - no sycl\n"; for (std::size_t i = 0; i < distance[0]; i++) { do_point(i); } @@ -147,14 +150,17 @@ void stencil_for_each_extended_2(auto op, stencil_index_type<2> &begin, }; if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION + std::cout << "do parallel_for 2d - sycl\n"; dr::__detail::parallel_for(dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), do_point) .wait(); #else + std::cout << "do parallel_for 2d - sycl fail\n"; assert(false); #endif } else { + std::cout << "do parallel_for 2d - no sycl\n"; for (std::size_t i = 0; i < distance[0]; i++) { for (std::size_t j = 0; j < distance[1]; j++) { do_point(stencil_index_type<2>{i, j}); From 3d410d204549f1da0ac7a41a0569324e4b0ec226 Mon Sep 17 00:00:00 2001 From: kc432959 Date: Sun, 22 Dec 2024 21:14:02 +0100 Subject: [PATCH 15/19] Implemented game_of_life --- benchmarks/gbench/mp/CMakeLists.txt | 4 + benchmarks/gbench/mp/game_of_life.cpp | 233 ++++++++++++++++++++++++++ include/dr/mp/algorithms/for_each.hpp | 6 - 3 files changed, 237 insertions(+), 6 deletions(-) create mode 100644 benchmarks/gbench/mp/game_of_life.cpp diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index 0ad4fb096c..a9f2452ba0 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -107,6 +107,10 @@ if(ENABLE_SYCL) endforeach() endif() +add_executable(game_of_life game_of_life.cpp) +target_link_libraries(game_of_life cxxopts DR::mpi) +target_compile_definitions(game_of_life PRIVATE STANDALONE_BENCHMARK) + add_executable(shallow_water shallow_water.cpp) target_link_libraries(shallow_water cxxopts DR::mpi) target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) diff --git a/benchmarks/gbench/mp/game_of_life.cpp b/benchmarks/gbench/mp/game_of_life.cpp new file mode 100644 index 0000000000..46f323d779 --- /dev/null +++ b/benchmarks/gbench/mp/game_of_life.cpp @@ -0,0 +1,233 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "cxxopts.hpp" +#include "dr/mp.hpp" +#include "mpi.h" +#include +#include +#include + +#ifdef STANDALONE_BENCHMARK + +MPI_Comm comm; +int comm_rank; +int comm_size; + +#else + +#include "../common/dr_bench.hpp" + +#endif + +namespace GameOfLife { + +using T = int; +using Array = dr::mp::distributed_mdarray; + +void init(std::size_t n, Array& out) { + std::vector> in(n, std::vector(n, 0)); + /* + 1 0 0 + 0 1 1 + 1 1 0 + */ + // clang-format off + in[1][1] = 1; in[1][2] = 0; in[1][3] = 0; + in[2][1] = 0; in[2][2] = 1; in[2][3] = 1; + in[3][1] = 1; in[3][2] = 1; in[3][3] = 0; + // clang-format on + std::vector local(n * n); + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + local[i * n + j] = in[i][j]; + } + } + dr::mp::copy(local.begin(), local.end(), out.begin()); +} + +void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { + if (comm_rank == 0) { + std::cout << "Using backend: dr" << std::endl; + std::cout << "Grid size: " << n << " x " << n << std::endl; + std::cout << "Time steps:" << steps << std::endl; + std::cout << "Redundancy " << redundancy << std::endl; + std::cout << std::endl; + } + + // construct grid + auto dist = dr::mp::distribution().halo(1).redundancy(redundancy); + Array array({n, n}, dist); + Array array_out({n, n}, dist); + dr::mp::fill(array, 0); + dr::mp::fill(array_out, 0); + + init(n, array); + + // execute one calculation for one cell in game of life + auto calculate = [](auto stencils) { + auto [x, x_out] = stencils; + // because below we calculate the sum of all 9 cells, + // but we want the output only of 8 neighbourhs, subtract the value of self. + int live_neighbours = -x(0, 0); + for (int i = -1; i <= 1; i++) { + for (int j = -1; j <= 1; j++) { + live_neighbours += x(i, j); // alive == 1, dead == 0, so simple addition works + } + } + + if (x(0, 0) == 1) { // self if alive + if (live_neighbours == 2 || live_neighbours == 3) { + x_out(0, 0) = 1; + } else { + x_out(0, 0) = 0; + } + } + else { // self is dead + if (live_neighbours == 3) { + x_out(0, 0) = 1; + } else { + x_out(0, 0) = 0; + } + } + }; + + // assign values of second array to first array + auto assign = [](auto stencils) { + auto [x, x_out] = stencils; + x(0, 0) = x_out(0, 0); + }; + + auto tic = std::chrono::steady_clock::now(); + + auto print = [n](const auto &v) { + std::vector local(n * n); + copy(v, local.begin()); + if (comm_rank == 0) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + std::cout << local[i * n + j] << " "; + } + std::cout << "\n"; + } + } + }; + + for (std::size_t i = 0; i < steps; i++) { + if (comm_rank == 0) { + std::cout << "Step " << i << "\n"; + } + // step + stencil_for_each_extended<2>(calculate, {1, 1}, {1, 1}, array, array_out); + stencil_for_each_extended<2>(assign, {0, 0}, {0, 0}, array, array_out); + // phase with communication - once after (redundancy - 1) steps without communication + if ((i + 1) % redundancy == 0) { + if (comm_rank == 0) { + std::cout << "Exchange\n"; + } + array.halo().exchange(); + // Array_out is a temporary, no need to exchange it + } + if (debug) { + if (comm_rank == 0) { + std::cout << "Array " << i << ":\n"; + } + print(array); + if (comm_rank == 0) { + std::cout << "\n"; + } + } + } + + auto toc = std::chrono::steady_clock::now(); + std::chrono::duration duration = toc - tic; + if (comm_rank == 0) { + double t_cpu = duration.count(); + double t_step = t_cpu / static_cast(steps); + std::cout << "Duration: " << std::setprecision(3) << t_cpu << " s" << std::endl; + std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000 << " ms" << std::endl; + } +} + +} // namespace GameOfLife + +#ifdef STANDALONE_BENCHMARK + +int main(int argc, char *argv[]) { + + MPI_Init(&argc, &argv); + comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); + + cxxopts::Options options_spec(argv[0], "game of life"); + // clang-format off + options_spec.add_options() + ("n,size", "Grid size", cxxopts::value()->default_value("128")) + ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) + ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")) + ("sycl", "Execute on SYCL device") + ("l,log", "enable logging") + ("d,debug", "enable debug logging") + ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("device-memory", "Use device memory") + ("h,help", "Print help"); + // clang-format on + + cxxopts::ParseResult options; + try { + options = options_spec.parse(argc, argv); + } catch (const cxxopts::OptionParseException &e) { + std::cout << options_spec.help() << "\n"; + exit(1); + } + + std::unique_ptr logfile; + if (options.count("log")) { + logfile = + std::make_unique(options["logprefix"].as() + + fmt::format(".{}.log", comm_rank)); + dr::drlog.set_file(*logfile); + } + + if (options.count("sycl")) { +#ifdef SYCL_LANGUAGE_VERSION + sycl::queue q = dr::mp::select_queue(); + std::cout << "Run on: " + << q.get_device().get_info() << "\n"; + dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); +#else + std::cout << "Sycl support requires icpx\n"; + exit(1); +#endif + } else { + if (comm_rank == 0) { + std::cout << "Run on: CPU\n"; + } + dr::mp::init(); + } + + std::size_t n = options["n"].as(); + std::size_t redundancy = options["r"].as(); + std::size_t steps = options["t"].as(); + + bool debug = false; + if (options.count("debug")) { + debug = true; + } + + GameOfLife::run(n, redundancy, steps, debug); + dr::mp::finalize(); + MPI_Finalize(); + return 0; +} + +#else + +static void GameOfLife_DR(benchmark::State &state) {} + +DR_BENCHMARK(GameOfLife_DR); + +#endif diff --git a/include/dr/mp/algorithms/for_each.hpp b/include/dr/mp/algorithms/for_each.hpp index 13a657771c..2b841be43b 100644 --- a/include/dr/mp/algorithms/for_each.hpp +++ b/include/dr/mp/algorithms/for_each.hpp @@ -97,16 +97,13 @@ void stencil_for_each_extended_1(auto op, stencil_index_type<1> begin, }; if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - std::cout << "do parallel_for 1d - sycl\n"; dr::__detail::parallel_for(dr::mp::sycl_queue(), sycl::range<1>(distance[0]), do_point) .wait(); #else - std::cout << "do parallel_for 1d - sycl failed\n"; assert(false); #endif } else { - std::cout << "do parallel_for 1d - no sycl\n"; for (std::size_t i = 0; i < distance[0]; i++) { do_point(i); } @@ -150,17 +147,14 @@ void stencil_for_each_extended_2(auto op, stencil_index_type<2> &begin, }; if (mp::use_sycl()) { #ifdef SYCL_LANGUAGE_VERSION - std::cout << "do parallel_for 2d - sycl\n"; dr::__detail::parallel_for(dr::mp::sycl_queue(), sycl::range<2>(distance[0], distance[1]), do_point) .wait(); #else - std::cout << "do parallel_for 2d - sycl fail\n"; assert(false); #endif } else { - std::cout << "do parallel_for 2d - no sycl\n"; for (std::size_t i = 0; i < distance[0]; i++) { for (std::size_t j = 0; j < distance[1]; j++) { do_point(stencil_index_type<2>{i, j}); From 287fcf7b36be39dc51962d7fbdea2d76d0ebba33 Mon Sep 17 00:00:00 2001 From: kc432959 Date: Mon, 23 Dec 2024 10:14:31 +0100 Subject: [PATCH 16/19] Added halo_2d exchange test --- test/gtest/mp/CMakeLists.txt | 3 ++- test/gtest/mp/halo-2d.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 test/gtest/mp/halo-2d.cpp diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index f62f965c59..74e5cd84e6 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -37,6 +37,7 @@ add_executable( copy.cpp distributed_vector.cpp halo.cpp + halo-2d.cpp mdstar.cpp mpsort.cpp reduce.cpp @@ -60,7 +61,7 @@ add_executable( # is OK to commit changes to the source file list. add_executable(mp-quick-test mp-tests.cpp - halo-3.cpp + halo-2d.cpp ) add_executable(mp-quick-test-3-only mp-tests.cpp diff --git a/test/gtest/mp/halo-2d.cpp b/test/gtest/mp/halo-2d.cpp new file mode 100644 index 0000000000..5168bd6511 --- /dev/null +++ b/test/gtest/mp/halo-2d.cpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: Intel Corporation +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "xp-tests.hpp" + +TEST(Halo2D, exchange_2d_test) { + dr::mp::distributed_mdarray dv({10, 10}, dr::mp::distribution().halo(1)); + + DRLOG("exchange start"); + dv.halo().exchange(); + DRLOG("exchange end"); +} From 00e1320127f1a9815a16e280501dff189c7cce33 Mon Sep 17 00:00:00 2001 From: kc432959 Date: Mon, 23 Dec 2024 17:09:42 +0100 Subject: [PATCH 17/19] Fixed game of life --- benchmarks/gbench/mp/CMakeLists.txt | 4 - test/gtest/mp/CMakeLists.txt | 7 + .../gbench => test/gtest}/mp/game_of_life.cpp | 199 +++++++++++------- 3 files changed, 125 insertions(+), 85 deletions(-) rename {benchmarks/gbench => test/gtest}/mp/game_of_life.cpp (55%) diff --git a/benchmarks/gbench/mp/CMakeLists.txt b/benchmarks/gbench/mp/CMakeLists.txt index a9f2452ba0..0ad4fb096c 100644 --- a/benchmarks/gbench/mp/CMakeLists.txt +++ b/benchmarks/gbench/mp/CMakeLists.txt @@ -107,10 +107,6 @@ if(ENABLE_SYCL) endforeach() endif() -add_executable(game_of_life game_of_life.cpp) -target_link_libraries(game_of_life cxxopts DR::mpi) -target_compile_definitions(game_of_life PRIVATE STANDALONE_BENCHMARK) - add_executable(shallow_water shallow_water.cpp) target_link_libraries(shallow_water cxxopts DR::mpi) target_compile_definitions(shallow_water PRIVATE STANDALONE_BENCHMARK) diff --git a/test/gtest/mp/CMakeLists.txt b/test/gtest/mp/CMakeLists.txt index 74e5cd84e6..175915a87d 100644 --- a/test/gtest/mp/CMakeLists.txt +++ b/test/gtest/mp/CMakeLists.txt @@ -81,6 +81,13 @@ foreach(test-exec IN ITEMS mp-tests mp-tests-3 mp-quick-test mp-quick-test-3-onl "${CMAKE_COMMAND} -E time") endforeach() +# Game of life +add_executable(game_of_life game_of_life.cpp) +if(ENABLE_ISHMEM) + target_link_ishmem(game_of_life) +endif() +target_link_libraries(game_of_life cxxopts DR::mpi) + # tests without --sycl flag will fail on IshmemBackend TODO: make them be # running somehow if ENABLE_ISHMEM will be default CI config if(NOT ENABLE_ISHMEM) diff --git a/benchmarks/gbench/mp/game_of_life.cpp b/test/gtest/mp/game_of_life.cpp similarity index 55% rename from benchmarks/gbench/mp/game_of_life.cpp rename to test/gtest/mp/game_of_life.cpp index 46f323d779..036df65376 100644 --- a/benchmarks/gbench/mp/game_of_life.cpp +++ b/test/gtest/mp/game_of_life.cpp @@ -5,21 +5,40 @@ #include "cxxopts.hpp" #include "dr/mp.hpp" #include "mpi.h" + +inline void barrier() { dr::mp::barrier(); } +inline void fence() { dr::mp::fence(); } +inline void fence_on(auto &&obj) { obj.fence(); } + #include #include #include -#ifdef STANDALONE_BENCHMARK +// + +struct MPI_data { + MPI_Comm comm; + int rank; + int size; -MPI_Comm comm; -int comm_rank; -int comm_size; + bool host() { + return rank == 0; + } +}; -#else +static MPI_data mpi_data; -#include "../common/dr_bench.hpp" +struct Options { + std::size_t size; + std::size_t steps; + std::size_t redundancy; + bool debug; -#endif + std::unique_ptr logfile; + + bool sycl; + bool device_memory; +}; namespace GameOfLife { @@ -27,7 +46,7 @@ using T = int; using Array = dr::mp::distributed_mdarray; void init(std::size_t n, Array& out) { - std::vector> in(n, std::vector(n, 0)); + std::vector> in(4, std::vector(4, 0)); /* 1 0 0 0 1 1 @@ -39,8 +58,8 @@ void init(std::size_t n, Array& out) { in[3][1] = 1; in[3][2] = 1; in[3][3] = 0; // clang-format on std::vector local(n * n); - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { local[i * n + j] = in[i][j]; } } @@ -48,7 +67,7 @@ void init(std::size_t n, Array& out) { } void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { - if (comm_rank == 0) { + if (mpi_data.host()) { std::cout << "Using backend: dr" << std::endl; std::cout << "Grid size: " << n << " x " << n << std::endl; std::cout << "Time steps:" << steps << std::endl; @@ -61,7 +80,6 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { Array array({n, n}, dist); Array array_out({n, n}, dist); dr::mp::fill(array, 0); - dr::mp::fill(array_out, 0); init(n, array); @@ -99,80 +117,93 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { x(0, 0) = x_out(0, 0); }; - auto tic = std::chrono::steady_clock::now(); - auto print = [n](const auto &v) { std::vector local(n * n); - copy(v, local.begin()); - if (comm_rank == 0) { + dr::mp::copy(0, v, local.begin()); + if (mpi_data.host()) { for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { - std::cout << local[i * n + j] << " "; + fmt::print("{}", local[i * n + j] == 1 ? '#' : '.'); } - std::cout << "\n"; + fmt::print("\n"); } } }; - for (std::size_t i = 0; i < steps; i++) { - if (comm_rank == 0) { - std::cout << "Step " << i << "\n"; + auto tic = std::chrono::steady_clock::now(); + for (std::size_t i = 0, next_treshold = 0; i < steps; i++) { + if (i >= next_treshold && mpi_data.host()) { + next_treshold += round(static_cast(steps / 100)); + double percent = round(static_cast(i) * 100 / static_cast(steps)); + fmt::print("Steps done {}% ({} of {} steps)\n", percent, i, steps); } + // step stencil_for_each_extended<2>(calculate, {1, 1}, {1, 1}, array, array_out); stencil_for_each_extended<2>(assign, {0, 0}, {0, 0}, array, array_out); + // phase with communication - once after (redundancy - 1) steps without communication if ((i + 1) % redundancy == 0) { - if (comm_rank == 0) { - std::cout << "Exchange\n"; + if (debug && mpi_data.host()) { + fmt::print("Exchange at step {}\n", i); } array.halo().exchange(); // Array_out is a temporary, no need to exchange it } + + // debug print if (debug) { - if (comm_rank == 0) { - std::cout << "Array " << i << ":\n"; + if (mpi_data.host()) { + fmt::print("Array {}:\n", i); } + // print needs a synchronication accros MPI boundary (dr::mp::copy), each node has to execute it print(array); - if (comm_rank == 0) { - std::cout << "\n"; - } } } - auto toc = std::chrono::steady_clock::now(); + std::chrono::duration duration = toc - tic; - if (comm_rank == 0) { + + if (mpi_data.host()) { double t_cpu = duration.count(); double t_step = t_cpu / static_cast(steps); - std::cout << "Duration: " << std::setprecision(3) << t_cpu << " s" << std::endl; - std::cout << "Time per step: " << std::setprecision(2) << t_step * 1000 << " ms" << std::endl; + + fmt::print("Steps done 100% ({} of {} steps)\n", steps, steps); + fmt::print("Duration {} s\n", t_cpu); + fmt::print("Time per step {} ms\n", t_step * 1000); } } } // namespace GameOfLife -#ifdef STANDALONE_BENCHMARK - -int main(int argc, char *argv[]) { +// Initialization functions +void init_MPI(int argc, char *argv[]) { MPI_Init(&argc, &argv); - comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); + mpi_data.comm = MPI_COMM_WORLD; + MPI_Comm_rank(mpi_data.comm, &mpi_data.rank); + MPI_Comm_size(mpi_data.comm, &mpi_data.size); + + dr::drlog.debug("MPI: rank = {}, size = {}\n", mpi_data.rank, mpi_data.size); +} + +Options parse_options(int argc, char *argv[]) { + Options out; cxxopts::Options options_spec(argv[0], "game of life"); + // clang-format off options_spec.add_options() - ("n,size", "Grid size", cxxopts::value()->default_value("128")) - ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) - ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")) - ("sycl", "Execute on SYCL device") - ("l,log", "enable logging") - ("d,debug", "enable debug logging") + ("drhelp", "Print help") + ("log", "Enable logging") ("logprefix", "appended .RANK.log", cxxopts::value()->default_value("dr")) + ("log-filter", "Filter the log", cxxopts::value>()) ("device-memory", "Use device memory") - ("h,help", "Print help"); + ("sycl", "Execute on SYCL device") + ("d,debug", "enable debug logging") + ("n,size", "Grid size", cxxopts::value()->default_value("128")) + ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) + ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")); // clang-format on cxxopts::ParseResult options; @@ -183,51 +214,57 @@ int main(int argc, char *argv[]) { exit(1); } - std::unique_ptr logfile; - if (options.count("log")) { - logfile = - std::make_unique(options["logprefix"].as() + - fmt::format(".{}.log", comm_rank)); - dr::drlog.set_file(*logfile); + out.sycl = options.count("sycl") != 0; + out.device_memory = options.count("debug") != 0; + + if (options.count("drhelp")) { + std::cout << options_spec.help() << "\n"; + exit(0); } - if (options.count("sycl")) { -#ifdef SYCL_LANGUAGE_VERSION - sycl::queue q = dr::mp::select_queue(); - std::cout << "Run on: " - << q.get_device().get_info() << "\n"; - dr::mp::init(q, options.count("device-memory") ? sycl::usm::alloc::device - : sycl::usm::alloc::shared); -#else - std::cout << "Sycl support requires icpx\n"; - exit(1); -#endif - } else { - if (comm_rank == 0) { - std::cout << "Run on: CPU\n"; + if (options.count("log")) { + out.logfile.reset(new std::ofstream(options["logprefix"].as() + + fmt::format(".{}.log", mpi_data.rank))); + dr::drlog.set_file(*out.logfile); + if (options.count("log-filter")) { + dr::drlog.filter(options["log-filter"].as>()); } - dr::mp::init(); } - std::size_t n = options["n"].as(); - std::size_t redundancy = options["r"].as(); - std::size_t steps = options["t"].as(); + out.size = options["n"].as(); + out.redundancy = options["r"].as(); + out.steps = options["t"].as(); - bool debug = false; - if (options.count("debug")) { - debug = true; - } + out.debug = options.count("debug") != 0; - GameOfLife::run(n, redundancy, steps, debug); - dr::mp::finalize(); - MPI_Finalize(); - return 0; + return out; +} + +void dr_init(const Options& options) { +#ifdef SYCL_LANGUAGE_VERSION + if (options.sycl) { + sycl::queue q; + fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info(), options.device_memory ? "devive" : "shared"); + dr::mp::init(q, options.device_memory ? sycl::usm::alloc::device + : sycl::usm::alloc::shared); + return; + } +#endif + fmt::print("Running on CPU\n"); + dr::mp::init(); } -#else +// Main loop -static void GameOfLife_DR(benchmark::State &state) {} +int main(int argc, char *argv[]) { + init_MPI(argc, argv); + Options options = parse_options(argc, argv); + dr_init(options); -DR_BENCHMARK(GameOfLife_DR); + GameOfLife::run(options.size, options.redundancy, options.steps, options.debug); -#endif + dr::mp::finalize(); + MPI_Finalize(); + + return 0; +} From b064a07cea2af7dca4af51552916037a07f9d787 Mon Sep 17 00:00:00 2001 From: kc432959 Date: Thu, 9 Jan 2025 21:58:09 +0100 Subject: [PATCH 18/19] Allow for nxm grids in game_of_life, add measurement of exchange step --- test/gtest/mp/game_of_life.cpp | 48 +++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/test/gtest/mp/game_of_life.cpp b/test/gtest/mp/game_of_life.cpp index 036df65376..446338c0b9 100644 --- a/test/gtest/mp/game_of_life.cpp +++ b/test/gtest/mp/game_of_life.cpp @@ -14,8 +14,6 @@ inline void fence_on(auto &&obj) { obj.fence(); } #include #include -// - struct MPI_data { MPI_Comm comm; int rank; @@ -29,7 +27,8 @@ struct MPI_data { static MPI_data mpi_data; struct Options { - std::size_t size; + std::size_t width; + std::size_t height; std::size_t steps; std::size_t redundancy; bool debug; @@ -57,7 +56,7 @@ void init(std::size_t n, Array& out) { in[2][1] = 0; in[2][2] = 1; in[2][3] = 1; in[3][1] = 1; in[3][2] = 1; in[3][3] = 0; // clang-format on - std::vector local(n * n); + std::vector local(n * 4); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { local[i * n + j] = in[i][j]; @@ -66,10 +65,10 @@ void init(std::size_t n, Array& out) { dr::mp::copy(local.begin(), local.end(), out.begin()); } -void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { +void run(std::size_t n, std::size_t m, std::size_t redundancy, std::size_t steps, bool debug) { if (mpi_data.host()) { std::cout << "Using backend: dr" << std::endl; - std::cout << "Grid size: " << n << " x " << n << std::endl; + std::cout << "Grid size: " << n << " x " << m << std::endl; std::cout << "Time steps:" << steps << std::endl; std::cout << "Redundancy " << redundancy << std::endl; std::cout << std::endl; @@ -77,11 +76,11 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { // construct grid auto dist = dr::mp::distribution().halo(1).redundancy(redundancy); - Array array({n, n}, dist); - Array array_out({n, n}, dist); + Array array({n, m}, dist); + Array array_out({n, m}, dist); dr::mp::fill(array, 0); - init(n, array); + init(m, array); // execute one calculation for one cell in game of life auto calculate = [](auto stencils) { @@ -117,19 +116,22 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { x(0, 0) = x_out(0, 0); }; - auto print = [n](const auto &v) { + auto print = [n, m](const auto &v) { std::vector local(n * n); dr::mp::copy(0, v, local.begin()); if (mpi_data.host()) { for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - fmt::print("{}", local[i * n + j] == 1 ? '#' : '.'); + for (int j = 0; j < m; j++) { + fmt::print("{}", local[i * m + j] == 1 ? '#' : '.'); } fmt::print("\n"); } } }; + std::chrono::duration exchange_duration; + std::size_t exchange_count = 0; + auto tic = std::chrono::steady_clock::now(); for (std::size_t i = 0, next_treshold = 0; i < steps; i++) { if (i >= next_treshold && mpi_data.host()) { @@ -147,7 +149,12 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { if (debug && mpi_data.host()) { fmt::print("Exchange at step {}\n", i); } + auto exchange_tic = std::chrono::steady_clock::now(); array.halo().exchange(); + auto exchange_toc = std::chrono::steady_clock::now(); + exchange_duration += exchange_toc - exchange_tic; + exchange_count++; + // Array_out is a temporary, no need to exchange it } @@ -166,11 +173,14 @@ void run(std::size_t n, std::size_t redundancy, std::size_t steps, bool debug) { if (mpi_data.host()) { double t_cpu = duration.count(); + double t_exch = exchange_duration.count(); double t_step = t_cpu / static_cast(steps); + double t_exch_step = t_exch / static_cast(exchange_count); fmt::print("Steps done 100% ({} of {} steps)\n", steps, steps); - fmt::print("Duration {} s\n", t_cpu); + fmt::print("Duration {} s, including exchange total time {} s\n", t_cpu, t_exch); fmt::print("Time per step {} ms\n", t_step * 1000); + fmt::print("Time per exchange {} ms\n", t_exch_step * 1000); } } @@ -201,7 +211,8 @@ Options parse_options(int argc, char *argv[]) { ("device-memory", "Use device memory") ("sycl", "Execute on SYCL device") ("d,debug", "enable debug logging") - ("n,size", "Grid size", cxxopts::value()->default_value("128")) + ("n,size", "Grid width", cxxopts::value()->default_value("128")) + ("m,height", "Grid height", cxxopts::value()->default_value("128")) ("t,steps", "Run a fixed number of time steps.", cxxopts::value()->default_value("100")) ("r,redundancy", "Set outer-grid redundancy parameter.", cxxopts::value()->default_value("2")); // clang-format on @@ -215,7 +226,7 @@ Options parse_options(int argc, char *argv[]) { } out.sycl = options.count("sycl") != 0; - out.device_memory = options.count("debug") != 0; + out.device_memory = options.count("device-memory") != 0; if (options.count("drhelp")) { std::cout << options_spec.help() << "\n"; @@ -231,7 +242,8 @@ Options parse_options(int argc, char *argv[]) { } } - out.size = options["n"].as(); + out.width = options["n"].as(); + out.height = options.count("m") != 0 ? options["m"].as() : out.width; out.redundancy = options["r"].as(); out.steps = options["t"].as(); @@ -244,7 +256,7 @@ void dr_init(const Options& options) { #ifdef SYCL_LANGUAGE_VERSION if (options.sycl) { sycl::queue q; - fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info(), options.device_memory ? "devive" : "shared"); + fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info(), options.device_memory ? "device" : "shared"); dr::mp::init(q, options.device_memory ? sycl::usm::alloc::device : sycl::usm::alloc::shared); return; @@ -261,7 +273,7 @@ int main(int argc, char *argv[]) { Options options = parse_options(argc, argv); dr_init(options); - GameOfLife::run(options.size, options.redundancy, options.steps, options.debug); + GameOfLife::run(options.width, options.height, options.redundancy, options.steps, options.debug); dr::mp::finalize(); MPI_Finalize(); From f91714e1e053f959a3822606b629909e8a8ef30f Mon Sep 17 00:00:00 2001 From: kc432959 Date: Mon, 27 Jan 2025 17:53:25 +0100 Subject: [PATCH 19/19] Change to select queue (broken) --- test/gtest/mp/game_of_life.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gtest/mp/game_of_life.cpp b/test/gtest/mp/game_of_life.cpp index 446338c0b9..30d976e679 100644 --- a/test/gtest/mp/game_of_life.cpp +++ b/test/gtest/mp/game_of_life.cpp @@ -255,7 +255,7 @@ Options parse_options(int argc, char *argv[]) { void dr_init(const Options& options) { #ifdef SYCL_LANGUAGE_VERSION if (options.sycl) { - sycl::queue q; + sycl::queue q = dr::mp::select_queue(); fmt::print("Running on sycl device: {}, memory: {}\n", q.get_device().get_info(), options.device_memory ? "device" : "shared"); dr::mp::init(q, options.device_memory ? sycl::usm::alloc::device : sycl::usm::alloc::shared);