Skip to content

Commit 3005f21

Browse files
committed
Cleanup GPU ranking metric.
1 parent c3a15c2 commit 3005f21

File tree

5 files changed

+12
-423
lines changed

5 files changed

+12
-423
lines changed

src/common/device_helpers.cuh

-170
Original file line numberDiff line numberDiff line change
@@ -825,176 +825,6 @@ XGBOOST_DEVICE auto tcrend(xgboost::common::Span<T> const &span) { // NOLINT
825825
return tcrbegin(span) + span.size();
826826
}
827827

828-
// This type sorts an array which is divided into multiple groups. The sorting is influenced
829-
// by the function object 'Comparator'
830-
template <typename T>
831-
class SegmentSorter {
832-
private:
833-
// Items sorted within the group
834-
caching_device_vector<T> ditems_;
835-
836-
// Original position of the items before they are sorted descending within their groups
837-
caching_device_vector<uint32_t> doriginal_pos_;
838-
839-
// Segments within the original list that delineates the different groups
840-
caching_device_vector<uint32_t> group_segments_;
841-
842-
// Need this on the device as it is used in the kernels
843-
caching_device_vector<uint32_t> dgroups_; // Group information on device
844-
845-
// Where did the item that was originally present at position 'x' move to after the items are sorted
846-
caching_device_vector<uint32_t> dindexable_sorted_pos_;
847-
848-
// Initialize everything but the segments
849-
void Init(uint32_t num_elems) {
850-
ditems_.resize(num_elems);
851-
852-
doriginal_pos_.resize(num_elems);
853-
thrust::sequence(doriginal_pos_.begin(), doriginal_pos_.end());
854-
}
855-
856-
// Initialize all with group info
857-
void Init(const std::vector<uint32_t> &groups) {
858-
uint32_t num_elems = groups.back();
859-
this->Init(num_elems);
860-
this->CreateGroupSegments(groups);
861-
}
862-
863-
public:
864-
// This needs to be public due to device lambda
865-
void CreateGroupSegments(const std::vector<uint32_t> &groups) {
866-
uint32_t num_elems = groups.back();
867-
group_segments_.resize(num_elems, 0);
868-
869-
dgroups_ = groups;
870-
871-
if (GetNumGroups() == 1) return; // There are no segments; hence, no need to compute them
872-
873-
// Define the segments by assigning a group ID to each element
874-
const uint32_t *dgroups = dgroups_.data().get();
875-
uint32_t ngroups = dgroups_.size();
876-
auto ComputeGroupIDLambda = [=] __device__(uint32_t idx) {
877-
return thrust::upper_bound(thrust::seq, dgroups, dgroups + ngroups, idx) -
878-
dgroups - 1;
879-
}; // NOLINT
880-
881-
thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
882-
thrust::make_counting_iterator(num_elems),
883-
group_segments_.begin(),
884-
ComputeGroupIDLambda);
885-
}
886-
887-
// Accessors that return device pointers
888-
inline uint32_t GetNumItems() const { return ditems_.size(); }
889-
inline const xgboost::common::Span<const T> GetItemsSpan() const {
890-
return { ditems_.data().get(), ditems_.size() };
891-
}
892-
893-
inline const xgboost::common::Span<const uint32_t> GetOriginalPositionsSpan() const {
894-
return { doriginal_pos_.data().get(), doriginal_pos_.size() };
895-
}
896-
897-
inline const xgboost::common::Span<const uint32_t> GetGroupSegmentsSpan() const {
898-
return { group_segments_.data().get(), group_segments_.size() };
899-
}
900-
901-
inline uint32_t GetNumGroups() const { return dgroups_.size() - 1; }
902-
inline const xgboost::common::Span<const uint32_t> GetGroupsSpan() const {
903-
return { dgroups_.data().get(), dgroups_.size() };
904-
}
905-
906-
inline const xgboost::common::Span<const uint32_t> GetIndexableSortedPositionsSpan() const {
907-
return { dindexable_sorted_pos_.data().get(), dindexable_sorted_pos_.size() };
908-
}
909-
910-
// Sort an array that is divided into multiple groups. The array is sorted within each group.
911-
// This version provides the group information that is on the host.
912-
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
913-
// is used.
914-
template <typename Comparator = thrust::greater<T>>
915-
void SortItems(const T *ditems, uint32_t item_size, const std::vector<uint32_t> &groups,
916-
const Comparator &comp = Comparator()) {
917-
this->Init(groups);
918-
this->SortItems(ditems, item_size, this->GetGroupSegmentsSpan(), comp);
919-
}
920-
921-
// Sort an array that is divided into multiple groups. The array is sorted within each group.
922-
// This version provides the group information that is on the device.
923-
// The array is sorted based on an adaptable binary predicate. By default a stateless predicate
924-
// is used.
925-
template <typename Comparator = thrust::greater<T>>
926-
void SortItems(const T *ditems, uint32_t item_size,
927-
const xgboost::common::Span<const uint32_t> &group_segments,
928-
const Comparator &comp = Comparator()) {
929-
this->Init(item_size);
930-
931-
// Sort the items that are grouped. We would like to avoid using predicates to perform the sort,
932-
// as thrust resorts to using a merge sort as opposed to a much much faster radix sort
933-
// when comparators are used. Hence, the following algorithm is used. This is done so that
934-
// we can grab the appropriate related values from the original list later, after the
935-
// items are sorted.
936-
//
937-
// Here is the internal representation:
938-
// dgroups_: [ 0, 3, 5, 8, 10 ]
939-
// group_segments_: 0 0 0 | 1 1 | 2 2 2 | 3 3
940-
// doriginal_pos_: 0 1 2 | 3 4 | 5 6 7 | 8 9
941-
// ditems_: 1 0 1 | 2 1 | 1 3 3 | 4 4 (from original items)
942-
//
943-
// Sort the items first and make a note of the original positions in doriginal_pos_
944-
// based on the sort
945-
// ditems_: 4 4 3 3 2 1 1 1 1 0
946-
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1
947-
// NOTE: This consumes space, but is much faster than some of the other approaches - sorting
948-
// in kernel, sorting using predicates etc.
949-
950-
ditems_.assign(thrust::device_ptr<const T>(ditems),
951-
thrust::device_ptr<const T>(ditems) + item_size);
952-
953-
// Allocator to be used by sort for managing space overhead while sorting
954-
dh::XGBCachingDeviceAllocator<char> alloc;
955-
956-
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
957-
ditems_.begin(), ditems_.end(),
958-
doriginal_pos_.begin(), comp);
959-
960-
if (GetNumGroups() == 1) return; // The entire array is sorted, as it isn't segmented
961-
962-
// Next, gather the segments based on the doriginal_pos_. This is to reflect the
963-
// holistic item sort order on the segments
964-
// group_segments_c_: 3 3 2 2 1 0 0 1 2 0
965-
// doriginal_pos_: 8 9 6 7 3 0 2 4 5 1 (stays the same)
966-
caching_device_vector<uint32_t> group_segments_c(item_size);
967-
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
968-
dh::tcbegin(group_segments), group_segments_c.begin());
969-
970-
// Now, sort the group segments so that you may bring the items within the group together,
971-
// in the process also noting the relative changes to the doriginal_pos_ while that happens
972-
// group_segments_c_: 0 0 0 1 1 2 2 2 3 3
973-
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9
974-
thrust::stable_sort_by_key(thrust::cuda::par(alloc),
975-
group_segments_c.begin(), group_segments_c.end(),
976-
doriginal_pos_.begin(), thrust::less<uint32_t>());
977-
978-
// Finally, gather the original items based on doriginal_pos_ to sort the input and
979-
// to store them in ditems_
980-
// doriginal_pos_: 0 2 1 3 4 6 7 5 8 9 (stays the same)
981-
// ditems_: 1 1 0 2 1 3 3 1 4 4 (from unsorted items - ditems)
982-
thrust::gather(doriginal_pos_.begin(), doriginal_pos_.end(),
983-
thrust::device_ptr<const T>(ditems), ditems_.begin());
984-
}
985-
986-
// Determine where an item that was originally present at position 'x' has been relocated to
987-
// after a sort. Creation of such an index has to be explicitly requested after a sort
988-
void CreateIndexableSortedPositions() {
989-
dindexable_sorted_pos_.resize(GetNumItems());
990-
thrust::scatter(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
991-
thrust::make_counting_iterator(GetNumItems()), // Rearrange indices...
992-
// ...based on this map
993-
dh::tcbegin(GetOriginalPositionsSpan()),
994-
dindexable_sorted_pos_.begin()); // Write results into this
995-
}
996-
};
997-
998828
// Atomic add function for gradients
999829
template <typename OutputGradientT, typename InputGradientT>
1000830
XGBOOST_DEV_INLINE void AtomicAddGpair(OutputGradientT* dest,

src/metric/metric.cc

+2-22
Original file line numberDiff line numberDiff line change
@@ -52,32 +52,13 @@ Metric::Create(const std::string& name, Context const* ctx) {
5252
metric->ctx_ = ctx;
5353
return metric;
5454
}
55-
56-
GPUMetric* GPUMetric::CreateGPUMetric(const std::string& name, Context const* ctx) {
57-
auto metric = CreateMetricImpl<MetricGPUReg>(name);
58-
if (metric == nullptr) {
59-
LOG(WARNING) << "Cannot find a GPU metric builder for metric " << name
60-
<< ". Resorting to the CPU builder";
61-
return nullptr;
62-
}
63-
64-
// Narrowing reference only for the compiler to allow assignment to a base class member.
65-
// As such, using this narrowed reference to refer to derived members will be an illegal op.
66-
// This is moot, as this type is stateless.
67-
auto casted = static_cast<GPUMetric*>(metric);
68-
CHECK(casted);
69-
casted->ctx_ = ctx;
70-
return casted;
71-
}
7255
} // namespace xgboost
7356

7457
namespace dmlc {
7558
DMLC_REGISTRY_ENABLE(::xgboost::MetricReg);
76-
DMLC_REGISTRY_ENABLE(::xgboost::MetricGPUReg);
7759
}
7860

79-
namespace xgboost {
80-
namespace metric {
61+
namespace xgboost::metric {
8162
// List of files that will be force linked in static links.
8263
DMLC_REGISTRY_LINK_TAG(auc);
8364
DMLC_REGISTRY_LINK_TAG(elementwise_metric);
@@ -88,5 +69,4 @@ DMLC_REGISTRY_LINK_TAG(rank_metric);
8869
DMLC_REGISTRY_LINK_TAG(auc_gpu);
8970
DMLC_REGISTRY_LINK_TAG(rank_metric_gpu);
9071
#endif
91-
} // namespace metric
92-
} // namespace xgboost
72+
} // namespace xgboost::metric

src/metric/metric_common.h

+8-48
Original file line numberDiff line numberDiff line change
@@ -23,53 +23,14 @@ class MetricNoCache : public Metric {
2323

2424
double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
2525
double result{0.0};
26-
auto const& info = p_fmat->Info();
27-
collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
28-
result = this->Eval(predts, info);
29-
});
26+
auto const &info = p_fmat->Info();
27+
collective::ApplyWithLabels(info, &result, sizeof(double),
28+
[&] { result = this->Eval(predts, info); });
3029
return result;
3130
}
3231
};
3332

34-
// This creates a GPU metric instance dynamically and adds it to the GPU metric registry, if not
35-
// present already. This is created when there is a device ordinal present and if xgboost
36-
// is compiled with CUDA support
37-
struct GPUMetric : public MetricNoCache {
38-
static GPUMetric *CreateGPUMetric(const std::string &name, Context const *tparam);
39-
};
40-
41-
/*!
42-
* \brief Internal registry entries for GPU Metric factory functions.
43-
* The additional parameter const char* param gives the value after @, can be null.
44-
* For example, metric map@3, then: param == "3".
45-
*/
46-
struct MetricGPUReg
47-
: public dmlc::FunctionRegEntryBase<MetricGPUReg,
48-
std::function<Metric * (const char*)> > {
49-
};
50-
51-
/*!
52-
* \brief Macro to register metric computed on GPU.
53-
*
54-
* \code
55-
* // example of registering a metric ndcg@k
56-
* XGBOOST_REGISTER_GPU_METRIC(NDCG_GPU, "ndcg")
57-
* .describe("NDCG metric computed on GPU.")
58-
* .set_body([](const char* param) {
59-
* int at_k = atoi(param);
60-
* return new NDCG(at_k);
61-
* });
62-
* \endcode
63-
*/
64-
65-
// Note: Metric names registered in the GPU registry should follow this convention:
66-
// - GPU metric types should be registered with the same name as the non GPU metric types
67-
#define XGBOOST_REGISTER_GPU_METRIC(UniqueId, Name) \
68-
::xgboost::MetricGPUReg& __make_ ## MetricGPUReg ## _ ## UniqueId ## __ = \
69-
::dmlc::Registry< ::xgboost::MetricGPUReg>::Get()->__REGISTER__(Name)
70-
7133
namespace metric {
72-
7334
// Ranking config to be used on device and host
7435
struct EvalRankConfig {
7536
public:
@@ -81,8 +42,8 @@ struct EvalRankConfig {
8142
};
8243

8344
class PackedReduceResult {
84-
double residue_sum_ { 0 };
85-
double weights_sum_ { 0 };
45+
double residue_sum_{0};
46+
double weights_sum_{0};
8647

8748
public:
8849
XGBOOST_DEVICE PackedReduceResult() {} // NOLINT
@@ -91,16 +52,15 @@ class PackedReduceResult {
9152

9253
XGBOOST_DEVICE
9354
PackedReduceResult operator+(PackedReduceResult const &other) const {
94-
return PackedReduceResult{residue_sum_ + other.residue_sum_,
95-
weights_sum_ + other.weights_sum_};
55+
return PackedReduceResult{residue_sum_ + other.residue_sum_, weights_sum_ + other.weights_sum_};
9656
}
9757
PackedReduceResult &operator+=(PackedReduceResult const &other) {
9858
this->residue_sum_ += other.residue_sum_;
9959
this->weights_sum_ += other.weights_sum_;
10060
return *this;
10161
}
102-
double Residue() const { return residue_sum_; }
103-
double Weights() const { return weights_sum_; }
62+
[[nodiscard]] double Residue() const { return residue_sum_; }
63+
[[nodiscard]] double Weights() const { return weights_sum_; }
10464
};
10565

10666
} // namespace metric

0 commit comments

Comments
 (0)