Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Union that can keep sort order #1834

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/engine/QueryExecutionTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <vector>

#include "engine/Sort.h"
#include "engine/Union.h"
#include "global/RuntimeParameters.h"

using std::string;
Expand Down Expand Up @@ -164,7 +165,14 @@ std::shared_ptr<QueryExecutionTree> QueryExecutionTree::createSortedTree(
return qet;
}

// Push down sort into Union.
QueryExecutionContext* qec = qet->getRootOperation()->getExecutionContext();
if (auto unionOperation =
std::dynamic_pointer_cast<Union>(qet->getRootOperation())) {
return std::make_shared<QueryExecutionTree>(
qec, unionOperation->createSortedVariant(sortColumns));
}

auto sort = std::make_shared<Sort>(qec, std::move(qet), sortColumns);
return std::make_shared<QueryExecutionTree>(qec, std::move(sort));
}
Expand Down
232 changes: 228 additions & 4 deletions src/engine/Union.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@

#include "engine/CallFixedSize.h"
#include "util/ChunkedForLoop.h"
#include "util/TransparentFunctors.h"
#include "util/CompilerWarnings.h"

const size_t Union::NO_COLUMN = std::numeric_limits<size_t>::max();

Union::Union(QueryExecutionContext* qec,
const std::shared_ptr<QueryExecutionTree>& t1,
const std::shared_ptr<QueryExecutionTree>& t2)
: Operation(qec) {
const std::shared_ptr<QueryExecutionTree>& t2,
std::vector<ColumnIndex> targetOrder)
: Operation(qec), targetOrder_{std::move(targetOrder)} {
AD_CONTRACT_CHECK(t1 && t2);
_subtrees[0] = t1;
_subtrees[1] = t2;
Expand Down Expand Up @@ -45,13 +46,42 @@
AD_CORRECTNESS_CHECK(ql::ranges::all_of(_columnOrigins, [](const auto& el) {
return el[0] != NO_COLUMN || el[1] != NO_COLUMN;
}));

if (!targetOrder_.empty()) {
auto computeSortOrder = [this](bool left) {
vector<ColumnIndex> specificSortOrder;
for (ColumnIndex index : targetOrder_) {
ColumnIndex realIndex = _columnOrigins.at(index).at(!left);
if (realIndex != NO_COLUMN) {
specificSortOrder.push_back(realIndex);
}
}
return specificSortOrder;
};

_subtrees[0] = QueryExecutionTree::createSortedTree(std::move(_subtrees[0]),
computeSortOrder(true));
_subtrees[1] = QueryExecutionTree::createSortedTree(
std::move(_subtrees[1]), computeSortOrder(false));

// Swap children to get cheaper computation
if (_columnOrigins.at(targetOrder_.at(0)).at(1) == NO_COLUMN) {
std::swap(_subtrees[0], _subtrees[1]);
ql::ranges::for_each(_columnOrigins,
[](auto& el) { std::swap(el[0], el[1]); });
}
}
}

string Union::getCacheKeyImpl() const {
std::ostringstream os;
os << _subtrees[0]->getCacheKey() << "\n";
os << "UNION\n";
os << _subtrees[1]->getCacheKey() << "\n";
os << "sort order: ";
for (size_t i : targetOrder_) {
os << i << " ";
}
return std::move(os).str();
}

Expand All @@ -64,7 +94,7 @@
return _columnOrigins.size();
}

vector<ColumnIndex> Union::resultSortedOn() const { return {}; }
vector<ColumnIndex> Union::resultSortedOn() const { return targetOrder_; }

// _____________________________________________________________________________
VariableToColumnMap Union::computeVariableToColumnMap() const {
Expand Down Expand Up @@ -165,6 +195,18 @@
std::shared_ptr<const Result> subRes2 =
_subtrees[1]->getResult(requestLaziness);

// If first sort column is not present in left child, we can fall back to the
// cheap computation because it orders the left child first.
if (!targetOrder_.empty() &&
_columnOrigins.at(targetOrder_.at(0)).at(0) != NO_COLUMN) {
auto generator = computeResultKeepOrder(requestLaziness, std::move(subRes1),
std::move(subRes2));
return requestLaziness
? ProtoResult{std::move(generator), resultSortedOn()}
: ProtoResult{cppcoro::getSingleElement(std::move(generator)),
resultSortedOn()};
}

if (requestLaziness) {
return {computeResultLazily(std::move(subRes1), std::move(subRes2)),
resultSortedOn()};
Expand Down Expand Up @@ -292,3 +334,185 @@
}
return copy;
}

// _____________________________________________________________________________
std::shared_ptr<Operation> Union::createSortedVariant(
const vector<ColumnIndex>& sortOrder) const {
return std::make_shared<Union>(_executionContext, _subtrees.at(0),
_subtrees.at(1), sortOrder);
}

namespace {
struct Wrapper {
const IdTable& idTable_;
const LocalVocab& localVocab_;
};
Result::IdTableVocabPair& moveOrCopy(Result::IdTableVocabPair& element) {
return element;
}
Result::IdTableVocabPair moveOrCopy(const Wrapper& element) {
return {element.idTable_.clone(), element.localVocab_.clone()};
}

Check warning on line 355 in src/engine/Union.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Union.cpp#L353-L355

Added lines #L353 - L355 were not covered by tests
} // namespace

// _____________________________________________________________________________
// Always inline makes makes a huge difference on large datasets.
template <size_t SPAN_SIZE>
AD_ALWAYS_INLINE bool Union::isSmaller(const auto& row1,
const auto& row2) const {
using StaticRange = std::span<const ColumnIndex, SPAN_SIZE>;
for (auto& col : StaticRange{targetOrder_}) {
ColumnIndex index1 = _columnOrigins.at(col).at(0);
ColumnIndex index2 = _columnOrigins.at(col).at(1);
Comment on lines +365 to +366
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auto [index1, index2] = _columnOrigins.at(col)
And measure, whether the check in the at() function costs something, and
whether we get a speedup by reordering the columns, such that they are alwasy 0, 1, ... in both inputs and the lookup in _columnOrigins in each loop disappears
(if you want to, first create a separate benchmark, if you don't want to do all the permutation reordering first, although it is rather simple.

if (index1 == NO_COLUMN) {
return true;
}
if (index2 == NO_COLUMN) {
return false;
}
Comment on lines +365 to +372
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could remove all the sort columns after the first NO_COLUM, and precompute an
efficient return value
(If all the columns are equal, do i return true (because there was NO_COLUMN in the left input) or false (because the NO_COLUMN was right, or because all columns were equal.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite sure what exactly you want. I can't initially trim targetOrder_, because we still need the original values elsewhere. Otherwise this information is lost. If you're proposing adding a copy that is tailored towards the comparator, to do better loop unrolling I could certainly do that, but I don't think sort orders that just apply to a single child are common, so I'm not sure if it's worth the added complexity.

if (row1[index1] != row2[index2]) {
return row1[index1] < row2[index2];
}
}
return false;
}

// _____________________________________________________________________________
Result::Generator Union::processRemaining(std::vector<ColumnIndex> permutation,
auto& it, auto end,
bool requestLaziness, size_t index,
IdTable& resultTable,
LocalVocab& localVocab) const {
// append the remaining elements
while (it != end) {
if (requestLaziness) {
if (index != 0) {
resultTable.insertAtEnd(it->idTable_, index, std::nullopt, permutation,
Id::makeUndefined());
localVocab.mergeWith(std::span{&it->localVocab_, 1});
co_yield Result::IdTableVocabPair{std::move(resultTable),
std::move(localVocab)};
} else {
if (!resultTable.empty()) {
co_yield Result::IdTableVocabPair{std::move(resultTable),
std::move(localVocab)};
}
auto&& pair = moveOrCopy(*it);
pair.idTable_ = transformToCorrectColumnFormat(std::move(pair.idTable_),
permutation);
co_yield pair;
}
} else {
resultTable.insertAtEnd(it->idTable_, index, std::nullopt, permutation,
Id::makeUndefined());
}
index = 0;
++it;
}
}

// _____________________________________________________________________________
template <int COMPARATOR_WIDTH>
Result::Generator Union::computeResultKeepOrderImpl(
bool requestLaziness, auto range1, auto range2,
std::pair<std::shared_ptr<const Result>, std::shared_ptr<const Result>>)
const {
constexpr size_t extent =
COMPARATOR_WIDTH == 0 ? std::dynamic_extent : COMPARATOR_WIDTH;
IdTable resultTable{getResultWidth(), allocator()};
if (requestLaziness) {
resultTable.reserve(chunkSize);
}
LocalVocab localVocab;
auto it1 = range1.begin();
auto it2 = range2.begin();
size_t index1 = 0;
size_t index2 = 0;
auto pushRow = [this, &resultTable](bool left, const auto& row) {
resultTable.emplace_back();
for (size_t column = 0; column < resultTable.numColumns(); column++) {
ColumnIndex origin = _columnOrigins.at(column).at(!left);
resultTable.at(resultTable.size() - 1, column) =
origin == NO_COLUMN ? Id::makeUndefined() : row[origin];
}
Comment on lines +431 to +437
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Someone should Evalute how costly this " I am doing rowwise pushes on a columnwise data structure" is
(Is this where you spend most of the time?)
We should extend the AddCombinedRowAdder to not only handle joins, but also UNIONs.

};
while (it1 != range1.end() && it2 != range2.end()) {
localVocab.mergeWith(std::span{&it1->localVocab_, 1});
localVocab.mergeWith(std::span{&it2->localVocab_, 1});
while (index1 < it1->idTable_.size() && index2 < it2->idTable_.size()) {
if (isSmaller<extent>(it1->idTable_.at(index1),
it2->idTable_.at(index2))) {
pushRow(true, it1->idTable_.at(index1));
index1++;
} else {
pushRow(false, it2->idTable_.at(index2));
index2++;
}
if (requestLaziness && resultTable.size() >= chunkSize) {
co_yield Result::IdTableVocabPair{std::move(resultTable),
std::move(localVocab)};
resultTable = IdTable{getResultWidth(), allocator()};
resultTable.reserve(chunkSize);
localVocab = LocalVocab{};
}
}
if (index1 == it1->idTable_.size()) {
++it1;
index1 = 0;
}
if (index2 == it2->idTable_.size()) {
++it2;
index2 = 0;
}
}

// append the remaining elements
for (auto& pair :
processRemaining(computePermutation<true>(), it1, range1.end(),
requestLaziness, index1, resultTable, localVocab)) {
AD_CORRECTNESS_CHECK(requestLaziness);
co_yield pair;
}
for (auto& pair :
processRemaining(computePermutation<false>(), it2, range2.end(),
requestLaziness, index2, resultTable, localVocab)) {
AD_CORRECTNESS_CHECK(requestLaziness);
co_yield pair;
}
if (!requestLaziness) {
co_yield Result::IdTableVocabPair{std::move(resultTable),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are moving away from coroutines for our C++17 backport,
please consider which types of ranges or state machines (see the end of the Iterators.h header)
can be used here.
This unfortunately will have some more lines of code, but we will benefit from it.

std::move(localVocab)};
}
}

// _____________________________________________________________________________
Result::Generator Union::computeResultKeepOrder(
bool requestLaziness, std::shared_ptr<const Result> result1,
std::shared_ptr<const Result> result2) const {
using Range = std::variant<Result::LazyResult, std::array<Wrapper, 1>>;
Range leftRange = result1->isFullyMaterialized()
? Range{std::array{Wrapper{result1->idTable(),
result1->localVocab()}}}
: Range{std::move(result1->idTables())};
Range rightRange = result2->isFullyMaterialized()
? Range{std::array{Wrapper{result2->idTable(),
result2->localVocab()}}}
: Range{std::move(result2->idTables())};

// Clang falsely reports that the this capture is unused here.
DISABLE_UNUSED_LAMBDA_CAPTURE_WARNINGS
return std::visit(
[this, requestLaziness, &result1, &result2](auto leftRange,
auto rightRange) {
return ad_utility::callFixedSize(
targetOrder_.size(),
[this, requestLaziness, &result1, &result2, &leftRange,
&rightRange]<int COMPARATOR_WIDTH>() {
return computeResultKeepOrderImpl<COMPARATOR_WIDTH>(
requestLaziness, std::move(leftRange), std::move(rightRange),
std::pair{std::move(result1), std::move(result2)});
});
},
std::move(leftRange), std::move(rightRange));
ENABLE_UNUSED_LAMBDA_CAPTURE_WARNINGS
}
35 changes: 34 additions & 1 deletion src/engine/Union.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ class Union : public Operation {
*/
std::vector<std::array<size_t, 2>> _columnOrigins;
std::array<std::shared_ptr<QueryExecutionTree>, 2> _subtrees;
std::vector<ColumnIndex> targetOrder_;

public:
Union(QueryExecutionContext* qec,
const std::shared_ptr<QueryExecutionTree>& t1,
const std::shared_ptr<QueryExecutionTree>& t2);
const std::shared_ptr<QueryExecutionTree>& t2,
std::vector<ColumnIndex> targetOrder = {});

protected:
virtual string getCacheKeyImpl() const override;
Expand Down Expand Up @@ -61,6 +63,14 @@ class Union : public Operation {
return {_subtrees[0].get(), _subtrees[1].get()};
}

// Create a sorted variant of this operation. This can be more efficient than
// stacking a `Sort` operation on top of this one because Union can simply
// push the sort down to its children. If one of the children is already
// sorted properly then it is way cheaper to sort the other child and then
// merge the two sorted results.
std::shared_ptr<Operation> createSortedVariant(
const vector<ColumnIndex>& sortColumns) const;

private:
std::unique_ptr<Operation> cloneImpl() const override;

Expand All @@ -85,4 +95,27 @@ class Union : public Operation {
Result::Generator computeResultLazily(
std::shared_ptr<const Result> result1,
std::shared_ptr<const Result> result2) const;

// Compares two rows with respect to the columns that the result is sorted on.
template <size_t SPAN_SIZE>
bool isSmaller(const auto& row1, const auto& row2) const;

// Helper function for `computeResultKeepOrderImpl` that processes any
// remaining elements once one side is exhausted.
Result::Generator processRemaining(std::vector<ColumnIndex> permutation,
auto& it, auto end, bool requestLaziness,
size_t index, IdTable& resultTable,
LocalVocab& localVocab) const;

// Actual implementation of `computeResultKeepOrder`.
template <int COMPARATOR_WIDTH>
Result::Generator computeResultKeepOrderImpl(
bool requestLaziness, auto range1, auto range2,
std::pair<std::shared_ptr<const Result>, std::shared_ptr<const Result>>
lifetimeExtension) const;

// Similar to `computeResultLazily` but it keeps the order of the results.
Result::Generator computeResultKeepOrder(
bool requestLaziness, std::shared_ptr<const Result> result1,
std::shared_ptr<const Result> result2) const;
};
Loading
Loading