From 3fc55514e16c2e75add4e01706b27cc6487a2be3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 17 Apr 2024 21:46:56 +0200 Subject: [PATCH 1/5] Use `LiteralOrIri` instead of `std::string` for expression results (#1318) So far, intermediate results of expressions that were not `Id`s, were of type `std::string`. As a consequence, local vocab entries that were the results of expressions were missing the quotes in their internal representation. Now, all expression results have a proper type, namely `Id` or `LiteralOrIri`, and entries in the local vocab have a consistent representation. --- src/engine/GroupBy.cpp | 2 +- src/engine/GroupByHashMapOptimization.h | 14 +- .../sparqlExpressions/AggregateExpression.h | 36 +- .../ConditionalExpressions.cpp | 14 +- .../sparqlExpressions/GroupConcatExpression.h | 4 +- .../sparqlExpressions/LiteralExpression.h | 11 +- .../RelationalExpressionHelpers.h | 37 +- .../RelationalExpressions.cpp | 13 +- .../SparqlExpressionTypes.cpp | 21 +- .../sparqlExpressions/SparqlExpressionTypes.h | 30 +- .../SparqlExpressionValueGetters.h | 57 ++-- .../sparqlExpressions/StringExpressions.cpp | 112 +++--- src/parser/Literal.cpp | 2 +- src/parser/Literal.h | 12 +- src/parser/LiteralOrIri.h | 7 + test/AggregateExpressionTest.cpp | 24 +- test/LocalVocabTest.cpp | 4 +- test/RelationalExpressionTest.cpp | 99 +++--- test/SparqlExpressionTest.cpp | 319 +++++++++++------- test/SparqlExpressionTypesTest.cpp | 8 +- test/StringUtilsTest.cpp | 8 + test/parser/LiteralOrIriTest.cpp | 15 + test/util/TripleComponentTestHelpers.h | 7 +- 23 files changed, 503 insertions(+), 353 deletions(-) diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 160015b493..4bd05f36f2 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -915,7 +915,7 @@ void GroupBy::extractValues( auto targetIterator = resultTable->getColumn(outCol).begin() + evaluationContext._beginIndex; - for (sparqlExpression::IdOrString val : generator) { + for (sparqlExpression::IdOrLiteralOrIri val : generator) { *targetIterator = sparqlExpression::detail::constantExpressionResultToId( std::move(val), *localVocab); ++targetIterator; diff --git a/src/engine/GroupByHashMapOptimization.h b/src/engine/GroupByHashMapOptimization.h index 76537960a3..afc57a6f77 100644 --- a/src/engine/GroupByHashMapOptimization.h +++ b/src/engine/GroupByHashMapOptimization.h @@ -69,11 +69,11 @@ struct CountAggregationData { // Data to perform MIN/MAX aggregation using the HashMap optimization. template struct ExtremumAggregationData { - sparqlExpression::IdOrString currentValue_; + sparqlExpression::IdOrLiteralOrIri currentValue_; bool firstValueSet_ = false; // _____________________________________________________________________________ - void addValue(const sparqlExpression::IdOrString& value, + void addValue(const sparqlExpression::IdOrLiteralOrIri& value, const sparqlExpression::EvaluationContext* ctx) { if (!firstValueSet_) { currentValue_ = value; @@ -88,10 +88,12 @@ struct ExtremumAggregationData { // _____________________________________________________________________________ [[nodiscard]] ValueId calculateResult(LocalVocab* localVocab) const { auto valueIdResultGetter = [](ValueId id) { return id; }; - auto stringResultGetter = [localVocab](const std::string& str) { - auto localVocabIndex = localVocab->getIndexAndAddIfNotContained(str); - return ValueId::makeFromLocalVocabIndex(localVocabIndex); - }; + auto stringResultGetter = + [localVocab](const ad_utility::triple_component::LiteralOrIri& str) { + auto localVocabIndex = localVocab->getIndexAndAddIfNotContained( + str.toStringRepresentation()); + return ValueId::makeFromLocalVocabIndex(localVocabIndex); + }; return std::visit(ad_utility::OverloadCallOperator(valueIdResultGetter, stringResultGetter), currentValue_); diff --git a/src/engine/sparqlExpressions/AggregateExpression.h b/src/engine/sparqlExpressions/AggregateExpression.h index 23bc06dec6..e19646ea11 100644 --- a/src/engine/sparqlExpressions/AggregateExpression.h +++ b/src/engine/sparqlExpressions/AggregateExpression.h @@ -238,32 +238,32 @@ using AvgExpression = // TODO Comment template inline const auto compareIdsOrStrings = - [](const T& a, const U& b, - const EvaluationContext* ctx) -> IdOrString { + []( + const T& a, const U& b, + const EvaluationContext* ctx) -> IdOrLiteralOrIri { // TODO moveTheStrings. return toBoolNotUndef( sparqlExpression::compareIdsOrStrings< Comp, valueIdComparators::ComparisonForIncompatibleTypes:: CompareByType>(a, b, ctx)) - ? IdOrString{a} - : IdOrString{b}; + ? IdOrLiteralOrIri{a} + : IdOrLiteralOrIri{b}; }; // Min and Max. template -inline const auto minMaxLambdaForAllTypes = []( - const T& a, const T& b, - const EvaluationContext* ctx) { - auto actualImpl = [ctx](const auto& x, const auto& y) { - return compareIdsOrStrings(x, y, ctx); - }; - if constexpr (ad_utility::isSimilar) { - return std::get(actualImpl(a, b)); - } else { - auto base = [](const IdOrString& i) -> const IdOrStringBase& { return i; }; - // TODO We should definitely move strings here. - return std::visit(actualImpl, base(a), base(b)); - } -}; +inline const auto minMaxLambdaForAllTypes = + [](const T& a, const T& b, + const EvaluationContext* ctx) { + auto actualImpl = [ctx](const auto& x, const auto& y) { + return compareIdsOrStrings(x, y, ctx); + }; + if constexpr (ad_utility::isSimilar) { + return std::get(actualImpl(a, b)); + } else { + // TODO We should definitely move strings here. + return std::visit(actualImpl, a, b); + } + }; constexpr inline auto minLambdaForAllTypes = minMaxLambdaForAllTypes; diff --git a/src/engine/sparqlExpressions/ConditionalExpressions.cpp b/src/engine/sparqlExpressions/ConditionalExpressions.cpp index df66810448..29aec5a7b2 100644 --- a/src/engine/sparqlExpressions/ConditionalExpressions.cpp +++ b/src/engine/sparqlExpressions/ConditionalExpressions.cpp @@ -14,8 +14,8 @@ using namespace sparqlExpression::detail; [[maybe_unused]] auto ifImpl = []( EffectiveBooleanValueGetter::Result condition, T&& i, - U&& e) -> IdOrString requires std::is_rvalue_reference_v && - std::is_rvalue_reference_v { + U&& e) -> IdOrLiteralOrIri requires std::is_rvalue_reference_v && + std::is_rvalue_reference_v { if (condition == EffectiveBooleanValueGetter::Result::True) { return AD_FWD(i); } else { @@ -50,16 +50,16 @@ class CoalesceExpression : public VariadicExpression { 0, ctx->size(), [&unboundIndices](size_t i) { unboundIndices.push_back(i); }, [ctx]() { ctx->cancellationHandle_->throwIfCancelled(); }); - VectorWithMemoryLimit result{ctx->_allocator}; + VectorWithMemoryLimit result{ctx->_allocator}; std::fill_n(std::back_inserter(result), ctx->size(), - IdOrString{Id::makeUndefined()}); + IdOrLiteralOrIri{Id::makeUndefined()}); if (result.empty()) { return result; } ctx->cancellationHandle_->throwIfCancelled(); - auto isUnbound = [](const IdOrString& x) { + auto isUnbound = [](const IdOrLiteralOrIri& x) { return (std::holds_alternative(x) && std::get(x) == Id::makeUndefined()); }; @@ -68,7 +68,7 @@ class CoalesceExpression : public VariadicExpression { [&nextUnboundIndices, &unboundIndices, &isUnbound, &result, ctx ](T && childResult) requires isConstantResult { - IdOrString constantResult{AD_FWD(childResult)}; + IdOrLiteralOrIri constantResult{AD_FWD(childResult)}; if (isUnbound(constantResult)) { nextUnboundIndices = std::move(unboundIndices); return; @@ -109,7 +109,7 @@ class CoalesceExpression : public VariadicExpression { // Skip all the indices where the result is already bound from a // previous child. if (i == *unboundIdxIt) { - if (IdOrString val{std::move(*generatorIterator)}; + if (IdOrLiteralOrIri val{std::move(*generatorIterator)}; isUnbound(val)) { nextUnboundIndices.push_back(i); } else { diff --git a/src/engine/sparqlExpressions/GroupConcatExpression.h b/src/engine/sparqlExpressions/GroupConcatExpression.h index f6b985386b..793857b676 100644 --- a/src/engine/sparqlExpressions/GroupConcatExpression.h +++ b/src/engine/sparqlExpressions/GroupConcatExpression.h @@ -53,7 +53,9 @@ class GroupConcatExpression : public SparqlExpression { groupConcatImpl(std::move(generator)); } result.shrink_to_fit(); - return IdOrString(std::move(result)); + return IdOrLiteralOrIri(ad_utility::triple_component::LiteralOrIri{ + ad_utility::triple_component::Literal::literalWithNormalizedContent( + asNormalizedStringViewUnsafe(result))}); }; auto childRes = child_->evaluate(context); diff --git a/src/engine/sparqlExpressions/LiteralExpression.h b/src/engine/sparqlExpressions/LiteralExpression.h index 729a240dc0..92e84957db 100644 --- a/src/engine/sparqlExpressions/LiteralExpression.h +++ b/src/engine/sparqlExpressions/LiteralExpression.h @@ -21,7 +21,7 @@ class LiteralExpression : public SparqlExpression { // make the `const` evaluate function threadsafe and lock-free. // TODO Make this unnecessary by completing multiple small groups at // once during the GROUP BY. - mutable std::atomic cachedResult_ = nullptr; + mutable std::atomic cachedResult_ = nullptr; public: // _________________________________________________________________________ @@ -49,10 +49,11 @@ class LiteralExpression : public SparqlExpression { return *ptr; } auto id = context->_qec.getIndex().getId(s); - IdOrString result = - id.has_value() ? IdOrString{id.value()} - : IdOrString{std::string{s.toStringRepresentation()}}; - auto ptrForCache = std::make_unique(result); + IdOrLiteralOrIri result = + id.has_value() + ? IdOrLiteralOrIri{id.value()} + : IdOrLiteralOrIri{ad_utility::triple_component::LiteralOrIri{s}}; + auto ptrForCache = std::make_unique(result); ptrForCache.reset(std::atomic_exchange_explicit( &cachedResult_, ptrForCache.release(), std::memory_order_relaxed)); context->cancellationHandle_->throwIfCancelled(); diff --git a/src/engine/sparqlExpressions/RelationalExpressionHelpers.h b/src/engine/sparqlExpressions/RelationalExpressionHelpers.h index 4d1cd53a7e..4c9f24d8be 100644 --- a/src/engine/sparqlExpressions/RelationalExpressionHelpers.h +++ b/src/engine/sparqlExpressions/RelationalExpressionHelpers.h @@ -113,13 +113,16 @@ constexpr Comparison getComparisonForSwappedArguments(Comparison comp) { // collation level. // TODO Make the collation level configurable. inline std::pair getRangeFromVocab( - const std::string& s, const EvaluationContext* context) { + const ad_utility::triple_component::LiteralOrIri& s, + const EvaluationContext* context) { auto level = TripleComponentComparator::Level::QUARTERNARY; // TODO This should be `Vocab::equal_range` - const ValueId lower = Id::makeFromVocabIndex( - context->_qec.getIndex().getVocab().lower_bound(s, level)); - const ValueId upper = Id::makeFromVocabIndex( - context->_qec.getIndex().getVocab().upper_bound(s, level)); + const ValueId lower = + Id::makeFromVocabIndex(context->_qec.getIndex().getVocab().lower_bound( + s.toStringRepresentation(), level)); + const ValueId upper = + Id::makeFromVocabIndex(context->_qec.getIndex().getVocab().upper_bound( + s.toStringRepresentation(), level)); return {lower, upper}; } @@ -127,10 +130,11 @@ inline std::pair getRangeFromVocab( // consecutive range of IDs. For its usage see below. template concept StoresStringOrId = - ad_utility::SimilarToAny>; -// Convert a string or `IdOrString` value into the (possibly empty) range of -// corresponding `ValueIds` (denoted by a `std::pair`, see + ad_utility::SimilarToAny>; +// Convert a string or `IdOrLiteralOrIri` value into the (possibly empty) range +// of corresponding `ValueIds` (denoted by a `std::pair`, see // `getRangeFromVocab` above for details). This function also takes `ValueId`s // and `pair` which are simply returned unchanged. This makes // the usage of this function easier. @@ -140,7 +144,7 @@ auto makeValueId(const S& value, const EvaluationContext* context) { return value; } else if constexpr (ad_utility::isSimilar>) { return value; - } else if constexpr (ad_utility::isSimilar) { + } else if constexpr (ad_utility::isSimilar) { auto visitor = [context](const auto& x) { auto res = makeValueId(x, context); if constexpr (ad_utility::isSimilar) { @@ -152,10 +156,11 @@ auto makeValueId(const S& value, const EvaluationContext* context) { return res; } }; - return value.visit(visitor); + return std::visit(visitor, value); } else { - static_assert(ad_utility::isSimilar); + static_assert( + ad_utility::isSimilar); return getRangeFromVocab(value, context); } }; @@ -171,11 +176,13 @@ inline const auto compareIdsOrStrings = []( const T& a, const U& b, const EvaluationContext* ctx) -> valueIdComparators::ComparisonResult { - if constexpr (ad_utility::isSimilar && - ad_utility::isSimilar) { + using LiteralOrIri = ad_utility::triple_component::LiteralOrIri; + if constexpr (ad_utility::isSimilar && + ad_utility::isSimilar) { // TODO integrate comparison via ICU and proper handling for // IRIs/ Literals/etc. - return valueIdComparators::fromBool(applyComparison(a, b)); + return valueIdComparators::fromBool(applyComparison( + a.toStringRepresentation(), b.toStringRepresentation())); } else { auto x = makeValueId(a, ctx); auto y = makeValueId(b, ctx); diff --git a/src/engine/sparqlExpressions/RelationalExpressions.cpp b/src/engine/sparqlExpressions/RelationalExpressions.cpp index 0a328c7bf5..a99d67180c 100644 --- a/src/engine/sparqlExpressions/RelationalExpressions.cpp +++ b/src/engine/sparqlExpressions/RelationalExpressions.cpp @@ -159,9 +159,9 @@ requires AreComparable ExpressionResult evaluateRelationalExpression( return std::nullopt; }; std::optional resultFromBinarySearch; - if constexpr (ad_utility::isSimilar) { + if constexpr (ad_utility::isSimilar) { resultFromBinarySearch = - value2.visit([&impl](const auto& x) { return impl(x); }); + std::visit([&impl](const auto& x) { return impl(x); }, value2); } else { resultFromBinarySearch = impl(value2); } @@ -186,14 +186,7 @@ requires AreComparable ExpressionResult evaluateRelationalExpression( AlwaysUndef>(x, y, context))); } }; - auto base = [](const I& arg) -> decltype(auto) { - if constexpr (ad_utility::isSimilar) { - return static_cast(arg); - } else { - return arg; - } - }; - ad_utility::visitWithVariantsAndParameters(impl, base(*itA), base(*itB)); + ad_utility::visitWithVariantsAndParameters(impl, *itA, *itB); ++itA; ++itB; context->cancellationHandle_->throwIfCancelled(); diff --git a/src/engine/sparqlExpressions/SparqlExpressionTypes.cpp b/src/engine/sparqlExpressions/SparqlExpressionTypes.cpp index 9dd5a0be07..36fec1d212 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionTypes.cpp +++ b/src/engine/sparqlExpressions/SparqlExpressionTypes.cpp @@ -6,17 +6,18 @@ namespace sparqlExpression { -IdOrString::IdOrString(std::optional s) { - if (s.has_value()) { - emplace(std::move(s.value())); - } else { - emplace(Id::makeUndefined()); - } -} - // _____________________________________________________________________________ -void PrintTo(const IdOrString& var, std::ostream* os) { - std::visit([&os](const auto& s) { *os << s; }, var); +void PrintTo(const IdOrLiteralOrIri& var, std::ostream* os) { + std::visit( + [&os](const T& s) { + auto& stream = *os; + if constexpr (std::same_as) { + stream << s; + } else { + stream << s.toStringRepresentation(); + } + }, + var); } // _____________________________________________________________________________ diff --git a/src/engine/sparqlExpressions/SparqlExpressionTypes.h b/src/engine/sparqlExpressions/SparqlExpressionTypes.h index 297a73f19d..dc753e5b04 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionTypes.h +++ b/src/engine/sparqlExpressions/SparqlExpressionTypes.h @@ -11,6 +11,8 @@ #include "engine/QueryExecutionContext.h" #include "engine/sparqlExpressions/SetOfIntervals.h" #include "global/Id.h" +#include "parser/LiteralOrIri.h" +#include "parser/TripleComponent.h" #include "parser/data/Variable.h" #include "util/AllocatorWithLimit.h" #include "util/HashSet.h" @@ -51,19 +53,11 @@ class VectorWithMemoryLimit // A class to store the results of expressions that can yield strings or IDs as // their result (for example IF and COALESCE). It is also used for expressions -// that can only yield strings. It is currently implemented as a thin wrapper -// around a `variant`, but a more sophisticated implementation could be done in -// the future. -using IdOrStringBase = std::variant; -class IdOrString : public IdOrStringBase, - public VisitMixin { - public: - using IdOrStringBase::IdOrStringBase; - explicit IdOrString(std::optional s); -}; - -// Print an `IdOrString` for googletest. -void PrintTo(const IdOrString& var, std::ostream* os); +// that can only yield strings. +using IdOrLiteralOrIri = + std::variant; +// Printing for GTest. +void PrintTo(const IdOrLiteralOrIri& var, std::ostream* os); /// The result of an expression can either be a vector of bool/double/int/string /// a variable (e.g. in BIND (?x as ?y)) or a "Set" of indices, which identifies @@ -72,7 +66,7 @@ void PrintTo(const IdOrString& var, std::ostream* os); namespace detail { // For each type T in this tuple, T as well as VectorWithMemoryLimit are // possible expression result types. -using ConstantTypes = std::tuple; +using ConstantTypes = std::tuple; using ConstantTypesAsVector = ad_utility::LiftedTuple; @@ -195,12 +189,14 @@ requires isConstantResult && std::is_rvalue_reference_v Id constantExpressionResultToId(T&& result, LocalVocabT& localVocab) { if constexpr (ad_utility::isSimilar) { return result; - } else if constexpr (ad_utility::isSimilar) { + } else if constexpr (ad_utility::isSimilar) { return std::visit( [&localVocab](R&& el) mutable { - if constexpr (ad_utility::isSimilar) { + if constexpr (ad_utility::isSimilar< + R, ad_utility::triple_component::LiteralOrIri>) { return Id::makeFromLocalVocabIndex( - localVocab.getIndexAndAddIfNotContained(std::forward(el))); + localVocab.getIndexAndAddIfNotContained( + AD_FWD(el).toStringRepresentation())); } else { static_assert(ad_utility::isSimilar); return el; diff --git a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h index f124121b12..95c9f270b7 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h +++ b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h @@ -20,6 +20,8 @@ namespace sparqlExpression::detail { +using LiteralOrIri = ad_utility::triple_component::LiteralOrIri; + // An empty struct to represent a non-numeric value in a context where only // numeric values make sense. struct NotNumeric {}; @@ -51,11 +53,12 @@ Id makeNumericId(T t) { } // All the numeric value getters have an `operator()` for `ValueId` and one for -// `std::string`. This mixin adds the `operator()` for the `IdOrString` variant -// via the CRTP pattern. +// `std::string`. This mixin adds the `operator()` for the `IdOrLiteralOrIri` +// variant via the CRTP pattern. template struct Mixin { - decltype(auto) operator()(IdOrString s, const EvaluationContext* ctx) const { + decltype(auto) operator()(IdOrLiteralOrIri s, + const EvaluationContext* ctx) const { return std::visit( [this, ctx](auto el) { return static_cast(this)->operator()(el, ctx); @@ -67,7 +70,7 @@ struct Mixin { // Return `NumericValue` which is then used as the input to numeric expressions. struct NumericValueGetter : Mixin { using Mixin::operator(); - NumericValue operator()(const string&, const EvaluationContext*) const { + NumericValue operator()(const LiteralOrIri&, const EvaluationContext*) const { return NotNumeric{}; } @@ -91,7 +94,7 @@ struct IsValidValueGetter : Mixin { // check for NULL/UNDEF values. bool operator()(ValueId id, const EvaluationContext*) const; - bool operator()(const string&, const EvaluationContext*) const { + bool operator()(const LiteralOrIri&, const EvaluationContext*) const { return true; } }; @@ -105,8 +108,8 @@ struct EffectiveBooleanValueGetter : Mixin { Result operator()(ValueId id, const EvaluationContext*) const; // Nonempty strings are true. - Result operator()(const std::string& s, const EvaluationContext*) const { - return s.empty() ? Result::False : Result::True; + Result operator()(const LiteralOrIri& s, const EvaluationContext*) const { + return s.getContent().empty() ? Result::False : Result::True; } }; @@ -116,13 +119,11 @@ struct StringValueGetter : Mixin { using Mixin::operator(); std::optional operator()(ValueId, const EvaluationContext*) const; - std::optional operator()(string s, const EvaluationContext*) const { - // Strip quotes - // TODO Use stronger types to encode literals/ IRIs/ ETC - if (s.size() >= 2 && s.starts_with('"') && s.ends_with('"')) { - return s.substr(1, s.size() - 2); - } - return s; + // TODO probably we should return a reference or a view here. + // TODO use a `NormalizedStringView` inside the expressions. + std::optional operator()(const LiteralOrIri& s, + const EvaluationContext*) const { + return std::string(asStringViewUnsafe(s.getContent())); } }; @@ -133,20 +134,24 @@ struct IsBlankNodeValueGetter : Mixin { return Id::makeFromBool(id.getDatatype() == Datatype::BlankNodeIndex); } - Id operator()(const std::string&, const EvaluationContext*) const { + Id operator()(const LiteralOrIri&, const EvaluationContext*) const { return Id::makeFromBool(false); } }; // Value getters for `isIRI`, `isBlank`, and `isLiteral`. -template +template struct IsSomethingValueGetter - : Mixin> { + : Mixin> { using Mixin::operator(); Id operator()(ValueId id, const EvaluationContext* context) const; - Id operator()(const std::string& s, const EvaluationContext*) const { - return Id::makeFromBool(s.starts_with(prefix)); + Id operator()(const LiteralOrIri& s, const EvaluationContext*) const { + // TODO Use the `isLiteral` etc. functions directly as soon as the + // local vocabulary also stores `LiteralOrIri`. + return Id::makeFromBool(s.toStringRepresentation().starts_with( + isLiteralOrIriSomethingFunction)); } }; static constexpr auto isIriPrefix = ad_utility::ConstexprSmallString<2>{"<"}; @@ -166,7 +171,7 @@ struct IsNumericValueGetter : Mixin { return Id::makeFromBool(datatype == Datatype::Double || datatype == Datatype::Int); } - Id operator()(const std::string&, const EvaluationContext*) const { + Id operator()(const LiteralOrIri&, const EvaluationContext*) const { return Id::makeFromBool(false); } }; @@ -185,7 +190,7 @@ struct DateValueGetter : Mixin { } } - Opt operator()(const std::string&, const EvaluationContext*) const { + Opt operator()(const LiteralOrIri&, const EvaluationContext*) const { return std::nullopt; } }; @@ -198,14 +203,12 @@ struct LiteralFromIdGetter : Mixin { using Mixin::operator(); std::optional operator()(ValueId id, const EvaluationContext* context) const; - std::optional operator()(std::string s, + std::optional operator()(const LiteralOrIri& s, const EvaluationContext*) const { - // Strip quotes - // TODO Use stronger types to encode literals/ IRIs/ ETC - if (s.size() >= 2 && s.starts_with('"') && s.ends_with('"')) { - return s.substr(1, s.size() - 2); + if (s.isIri()) { + return std::nullopt; } - return s; + return std::string{asStringViewUnsafe(s.getContent())}; } }; diff --git a/src/engine/sparqlExpressions/StringExpressions.cpp b/src/engine/sparqlExpressions/StringExpressions.cpp index 43c3272dc4..7e48871a94 100644 --- a/src/engine/sparqlExpressions/StringExpressions.cpp +++ b/src/engine/sparqlExpressions/StringExpressions.cpp @@ -9,9 +9,26 @@ namespace sparqlExpression { namespace detail::string_expressions { + +using LiteralOrIri = ad_utility::triple_component::LiteralOrIri; + +// Convert a `string_view` to a `LiteralOrIri` that stores a `Literal`. +// Note: This currently requires a copy of a string since the `Literal` class +// has to add the quotation marks. +constexpr auto toLiteral = [](std::string_view normalizedContent) { + return LiteralOrIri{ + ad_utility::triple_component::Literal::literalWithNormalizedContent( + asNormalizedStringViewUnsafe(normalizedContent))}; +}; + // String functions. -[[maybe_unused]] auto strImpl = [](std::optional s) { - return IdOrString{std::move(s)}; +[[maybe_unused]] auto strImpl = + [](std::optional s) -> IdOrLiteralOrIri { + if (s.has_value()) { + return IdOrLiteralOrIri{toLiteral(s.value())}; + } else { + return Id::makeUndefined(); + } }; NARY_EXPRESSION(StrExpressionImpl, 1, FV); @@ -70,10 +87,11 @@ class StringExpressionImpl : public SparqlExpression { // Lift a `Function` that takes one or multiple `std::string`s (possibly via // references) and returns an `Id` or `std::string` to a function that takes the -// same number of `std::optional` and returns `Id` or `IdOrString`. -// If any of the optionals is `std::nullopt`, then UNDEF is returned, else the -// result of the `Function` with the values of the optionals. This is a useful -// helper function for implementing expressions that work on strings. +// same number of `std::optional` and returns `Id` or +// `IdOrLiteralOrIri`. If any of the optionals is `std::nullopt`, then UNDEF is +// returned, else the result of the `Function` with the values of the optionals. +// This is a useful helper function for implementing expressions that work on +// strings. template struct LiftStringFunction { template >... Arguments> @@ -81,12 +99,12 @@ struct LiftStringFunction { using ResultOfFunction = decltype(std::invoke(Function{}, std::move(arguments.value())...)); static_assert(std::same_as || - std::same_as, + std::same_as, "Template argument of `LiftStringFunction` must return `Id` " "or `std::string`"); using Result = std::conditional_t, Id, - IdOrString>; + IdOrLiteralOrIri>; if ((... || !arguments.has_value())) { return Result{Id::makeUndefined()}; } @@ -104,22 +122,22 @@ using StrlenExpression = // LCASE [[maybe_unused]] auto lowercaseImpl = - [](std::optional input) -> IdOrString { + [](std::optional input) -> IdOrLiteralOrIri { if (!input.has_value()) { return Id::makeUndefined(); } else { - return ad_utility::utf8ToLower(input.value()); + return toLiteral(ad_utility::utf8ToLower(input.value())); } }; using LowercaseExpression = StringExpressionImpl<1, decltype(lowercaseImpl)>; // UCASE [[maybe_unused]] auto uppercaseImpl = - [](std::optional input) -> IdOrString { + [](std::optional input) -> IdOrLiteralOrIri { if (!input.has_value()) { return Id::makeUndefined(); } else { - return ad_utility::utf8ToUpper(input.value()); + return toLiteral(ad_utility::utf8ToUpper(input.value())); } }; using UppercaseExpression = StringExpressionImpl<1, decltype(uppercaseImpl)>; @@ -149,15 +167,15 @@ class SubstrImpl { }; public: - IdOrString operator()(std::optional s, NumericValue start, - NumericValue length) const { + IdOrLiteralOrIri operator()(std::optional s, NumericValue start, + NumericValue length) const { if (!s.has_value() || std::holds_alternative(start) || std::holds_alternative(length)) { return Id::makeUndefined(); } if (isNan(start) || isNan(length)) { - return std::string{}; + return toLiteral(""); } // In SPARQL indices are 1-based, but the implementation we use is 0 based, @@ -185,8 +203,8 @@ class SubstrImpl { return static_cast(n); }; - return std::string{ - ad_utility::getUTF8Substring(str, clamp(startInt), clamp(lengthInt))}; + return toLiteral( + ad_utility::getUTF8Substring(str, clamp(startInt), clamp(lengthInt))); } }; @@ -226,22 +244,22 @@ using ContainsExpression = // STRAFTER / STRBEFORE template [[maybe_unused]] auto strAfterOrBeforeImpl = - [](std::string text, std::string_view pattern) -> std::string { - // Required by the SPARQL standard. - if (pattern.empty()) { - return text; - } - auto pos = text.find(pattern); - if (pos >= text.size()) { - return ""; - } - if constexpr (isStrAfter) { - return text.substr(pos + pattern.size()); - } else { - // STRBEFORE - return text.substr(0, pos); - } -}; + [](std::string_view text, std::string_view pattern) { + // Required by the SPARQL standard. + if (pattern.empty()) { + return toLiteral(text); + } + auto pos = text.find(pattern); + if (pos >= text.size()) { + return toLiteral(""); + } + if constexpr (isStrAfter) { + return toLiteral(text.substr(pos + pattern.size())); + } else { + // STRBEFORE + return toLiteral(text.substr(0, pos)); + } + }; auto strAfter = strAfterOrBeforeImpl; @@ -257,7 +275,7 @@ using StrBeforeExpression = [[maybe_unused]] auto replaceImpl = [](std::optional input, const std::unique_ptr& pattern, - const std::optional& replacement) -> IdOrString { + const std::optional& replacement) -> IdOrLiteralOrIri { if (!input.has_value() || !pattern || !replacement.has_value()) { return Id::makeUndefined(); } @@ -269,7 +287,7 @@ using StrBeforeExpression = } const auto& repl = replacement.value(); re2::RE2::GlobalReplace(&in, pat, repl); - return std::move(in); + return toLiteral(in); }; using ReplaceExpression = @@ -352,15 +370,16 @@ class ConcatExpression : public detail::VariadicExpression { std::visit(visitSingleExpressionResult, child->evaluate(ctx)); }); - // Lift the result from `string` to `IdOrString` which is needed for the - // expression module. + // Lift the result from `string` to `IdOrLiteralOrIri` which is needed for + // the expression module. if (std::holds_alternative(result)) { - return IdOrString{std::move(std::get(result))}; + return IdOrLiteralOrIri{toLiteral(std::get(result))}; } else { auto& stringVec = std::get(result); - VectorWithMemoryLimit resultAsVec{ - std::make_move_iterator(stringVec.begin()), - std::make_move_iterator(stringVec.end()), ctx->_allocator}; + VectorWithMemoryLimit resultAsVec(ctx->_allocator); + resultAsVec.reserve(stringVec.size()); + std::ranges::copy(stringVec | std::views::transform(toLiteral), + std::back_inserter(resultAsVec)); return resultAsVec; } } @@ -368,20 +387,13 @@ class ConcatExpression : public detail::VariadicExpression { // ENCODE_FOR_URI [[maybe_unused]] auto encodeForUriImpl = - [](std::optional input) -> IdOrString { + [](std::optional input) -> IdOrLiteralOrIri { if (!input.has_value()) { return Id::makeUndefined(); } else { std::string_view value{input.value()}; - if (value.starts_with("\"")) { - auto contentEnd = ad_utility::findLiteralEnd(value, "\""); - if (contentEnd != 0) { - std::string_view content = value.substr(1, contentEnd - 1); - return boost::urls::encode(content, boost::urls::unreserved_chars); - } - } - return boost::urls::encode(value, boost::urls::unreserved_chars); + return toLiteral(boost::urls::encode(value, boost::urls::unreserved_chars)); } }; using EncodeForUriExpression = diff --git a/src/parser/Literal.cpp b/src/parser/Literal.cpp index dca504da59..a9fb4364e6 100644 --- a/src/parser/Literal.cpp +++ b/src/parser/Literal.cpp @@ -78,7 +78,7 @@ Literal Literal::literalWithoutQuotes( // __________________________________________ Literal Literal::literalWithNormalizedContent( - NormalizedString normalizedRdfContent, + NormalizedStringView normalizedRdfContent, std::optional> descriptor) { auto quotes = "\""sv; auto actualContent = diff --git a/src/parser/Literal.h b/src/parser/Literal.h index 65ac735c86..6e40d4e0c8 100644 --- a/src/parser/Literal.h +++ b/src/parser/Literal.h @@ -25,12 +25,6 @@ class Literal { // Create a new literal without any descriptor explicit Literal(std::string content, size_t beginOfSuffix_); - // Similar to `fromEscapedRdfLiteral`, except the rdfContent is expected to - // already be normalized - static Literal literalWithNormalizedContent( - NormalizedString normalizedRdfContent, - std::optional> descriptor = std::nullopt); - // Internal helper function. Return either the empty string (for a plain // literal), `@langtag` or `^^`. std::string_view getSuffix() const { @@ -79,6 +73,12 @@ class Literal { std::string_view rdfContentWithQuotes, std::optional> descriptor = std::nullopt); + // Similar to `fromEscapedRdfLiteral`, except the rdfContent is expected to + // already be normalized + static Literal literalWithNormalizedContent( + NormalizedStringView normalizedRdfContent, + std::optional> descriptor = std::nullopt); + void addLanguageTag(std::string_view languageTag); void addDatatype(const Iri& datatype); diff --git a/src/parser/LiteralOrIri.h b/src/parser/LiteralOrIri.h index ba34b7c8e9..5a6724a15d 100644 --- a/src/parser/LiteralOrIri.h +++ b/src/parser/LiteralOrIri.h @@ -121,5 +121,12 @@ class LiteralOrIri { // Create a new iri given a prefix iri and its suffix static LiteralOrIri prefixedIri(const Iri& prefix, std::string_view suffix); + + // Printing for GTest + friend void PrintTo(const LiteralOrIri& literalOrIri, std::ostream* os) { + auto& s = *os; + s << literalOrIri.toStringRepresentation(); + } }; + } // namespace ad_utility::triple_component diff --git a/test/AggregateExpressionTest.cpp b/test/AggregateExpressionTest.cpp index 40bb0a3106..0f86fd6f50 100644 --- a/test/AggregateExpressionTest.cpp +++ b/test/AggregateExpressionTest.cpp @@ -6,6 +6,7 @@ #include "./util/GTestHelpers.h" #include "./util/IdTableHelpers.h" #include "./util/IdTestHelpers.h" +#include "./util/TripleComponentTestHelpers.h" #include "engine/ValuesForTesting.h" #include "engine/sparqlExpressions/AggregateExpression.h" #include "gtest/gtest.h" @@ -20,6 +21,10 @@ auto V = VocabId; auto U = Id::makeUndefined(); auto L = LocalVocabId; auto D = DoubleId; +auto lit = [](auto s) { + return IdOrLiteralOrIri( + ad_utility::triple_component::LiteralOrIri(tripleComponentLiteral(s))); +}; static const Id NaN = D(std::numeric_limits::quiet_NaN()); } // namespace @@ -50,9 +55,10 @@ TEST(AggregateExpression, max) { testMaxId({V(7), U, V(2), V(4)}, V(7)); testMaxId({I(3), U, V(0), L(3), U, (I(-1))}, L(3)); - auto testMaxString = testAggregate; + auto testMaxString = testAggregate; // TODO Implement correct comparison on strings - testMaxString({"alpha", "äpfel", "Beta", "unfug"}, "äpfel"); + testMaxString({lit("alpha"), lit("äpfel"), lit("Beta"), lit("unfug")}, + lit("äpfel")); } // ______________________________________________________________________________ @@ -63,9 +69,10 @@ TEST(AggregateExpression, min) { testMinId({V(7), U, V(2), V(4)}, U); testMinId({I(3), V(0), L(3), (I(-1))}, I(-1)); - auto testMinString = testAggregate; + auto testMinString = testAggregate; // TODO Implement correct comparison on strings - testMinString({"alpha", "äpfel", "Beta", "unfug"}, "Beta"); + testMinString({lit("alpha"), lit("äpfel"), lit("Beta"), lit("unfug")}, + lit("Beta")); } // ______________________________________________________________________________ @@ -77,9 +84,8 @@ TEST(AggregateExpression, sum) { testSumId({I(3), U}, U); testSumId({I(3), NaN}, NaN); - auto testSumString = testAggregate; - // TODO The result should be `UNDEF` not `NaN` - testSumString({"alpha", "äpfel", "Beta", "unfug"}, U); + auto testSumString = testAggregate; + testSumString({lit("alpha"), lit("äpfel"), lit("Beta"), lit("unfug")}, U); } // ______________________________________________________________________________ @@ -91,6 +97,6 @@ TEST(AggregateExpression, count) { testCountId({U, I(3), U}, I(1)); testCountId({I(3), NaN, NaN}, I(2), true); - auto testCountString = testAggregate; - testCountString({"alpha", "äpfel", "", "unfug"}, I(4)); + auto testCountString = testAggregate; + testCountString({lit("alpha"), lit("äpfel"), lit(""), lit("unfug")}, I(4)); } diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index bdba4a9b68..9b2615a737 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -278,8 +278,8 @@ TEST(LocalVocab, propagation) { testQec, {Variable{"?x"}}, {Alias{groupConcatExpression("?y", "|"), Variable{"?concat"}}}, qet(values1)); - checkLocalVocab( - groupBy, std::vector{"", "", "", "yN1|yN2"}); + checkLocalVocab(groupBy, std::vector{"", "", "", + "\"yN1|yN2\""}); // DISTINCT again, but after something has been added to the local vocabulary // (to check that the "y1|y2" added by the GROUP BY does not also appear here, diff --git a/test/RelationalExpressionTest.cpp b/test/RelationalExpressionTest.cpp index bd5e364cca..22be1fa43e 100644 --- a/test/RelationalExpressionTest.cpp +++ b/test/RelationalExpressionTest.cpp @@ -8,6 +8,7 @@ #include "./SparqlExpressionTestHelpers.h" #include "./util/AllocatorTestHelpers.h" #include "./util/GTestHelpers.h" +#include "./util/TripleComponentTestHelpers.h" #include "engine/sparqlExpressions/RelationalExpressions.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -23,6 +24,14 @@ using valueIdComparators::Comparison; // First some internal helper functions and constants. namespace { +auto lit = [](std::string_view s) { + return ad_utility::triple_component::LiteralOrIri(tripleComponentLiteral(s)); +}; + +auto iriref = [](std::string_view s) { + return ad_utility::triple_component::LiteralOrIri(iri(s)); +}; + // Convenient access to constants for "infinity" and "not a number". The // spelling `NaN` was chosen because `nan` conflicts with the standard library. const auto inf = std::numeric_limits::infinity(); @@ -295,30 +304,31 @@ TEST(RelationalExpression, DoubleAndDouble) { } TEST(RelationalExpression, StringAndString) { - testLessThanGreaterThanEqualHelper( - {"alpha", "beta"}, {"sigma", "delta"}, {"epsilon", "epsilon"}); + testLessThanGreaterThanEqualHelper( + {lit("alpha"), lit("beta")}, {lit("sigma"), lit("delta")}, + {lit("epsilon"), lit("epsilon")}); // TODO These tests only work, when we actually use unicode // comparisons for the string based expressions. // TODO Add an example for strings that are bytewise different but // equal on the unicode level (e.g.`ä` vs `a + dots`. - // testLessThanGreaterThanEqualHelper({"Alpha", - // "beta"}, + // testLessThanGreaterThanEqualHelper({"Alpha", "beta"}, // {"beta", "äpfel"}, {"xxx", "xxx"}); } TEST(RelationalExpression, NumericAndStringAreNeverEqual) { - auto stringVec = - VectorWithMemoryLimit({"hallo", "by", ""}, makeAllocator()); + auto stringVec = VectorWithMemoryLimit( + {lit("hallo"), lit("by"), lit("")}, makeAllocator()); auto intVec = VectorWithMemoryLimit({-12365, 0, 12}, makeAllocator()); auto doubleVec = VectorWithMemoryLimit({-12.365, 0, 12.1e5}, makeAllocator()); - testUndefHelper(int64_t{3}, IdOrString{"hallo"}); - testUndefHelper(int64_t{3}, IdOrString{"3"}); - testUndefHelper(-12.0, IdOrString{"hallo"}); - testUndefHelper(-12.0, IdOrString{"-12.0"}); - testUndefHelper(intVec.clone(), IdOrString{"someString"}); - testUndefHelper(doubleVec.clone(), IdOrString{"someString"}); + testUndefHelper(int64_t{3}, IdOrLiteralOrIri{lit("hallo")}); + testUndefHelper(int64_t{3}, IdOrLiteralOrIri{lit("3")}); + testUndefHelper(-12.0, IdOrLiteralOrIri{lit("hallo")}); + testUndefHelper(-12.0, IdOrLiteralOrIri{lit("-12.0")}); + testUndefHelper(intVec.clone(), IdOrLiteralOrIri{lit("someString")}); + testUndefHelper(doubleVec.clone(), IdOrLiteralOrIri{lit("someString")}); testUndefHelper(int64_t{3}, stringVec.clone()); testUndefHelper(intVec.clone(), stringVec.clone()); testUndefHelper(doubleVec.clone(), stringVec.clone()); @@ -535,21 +545,21 @@ TEST(RelationalExpression, NumericConstantAndNumericVector) { } TEST(RelationalExpression, StringConstantsAndStringVector) { - VectorWithMemoryLimit vec( - {"alpha", "alpaka", "bertram", "sigma", "zeta", "kaulquappe", "caesar", - "caesar", "caesar"}, + VectorWithMemoryLimit vec( + {lit("alpha"), lit("alpaka"), lit("bertram"), lit("sigma"), lit("zeta"), + lit("kaulquappe"), lit("caesar"), lit("caesar"), lit("caesar")}, makeAllocator()); - testLessThanGreaterThanEqualMultipleValuesHelper(IdOrString{"caesar"}, - vec.clone()); + testLessThanGreaterThanEqualMultipleValuesHelper( + IdOrLiteralOrIri{lit("caesar")}, vec.clone()); // TODO These tests only work, when we actually use unicode // comparisons for the string based expressions. TODDO Add an example // for strings that are bytewise different but equal on the unicode level // (e.g.`ä` vs `a + dots`. - // VectorWithMemoryLimit vec2({"AlpHa", "älpaka", "Æ", "sigma", - // "Eta", "kaulQuappe", "Caesar", "Caesar", "Caesare"}, alloc); - // testLessThanGreaterThanEqualHelper({"Alpha", - // "beta"}, + // VectorWithMemoryLimit vec2({"AlpHa", "älpaka", "Æ", + // "sigma", "Eta", "kaulQuappe", "Caesar", "Caesar", "Caesare"}, alloc); + // testLessThanGreaterThanEqualHelper({"Alpha", "beta"}, // {"beta", "äpfel"}, {"xxx", "xxx"}); } @@ -592,13 +602,13 @@ TEST(RelationalExpression, DoubleVectorAndIntVector) { } TEST(RelationalExpression, StringVectorAndStringVector) { - VectorWithMemoryLimit vecA{ - {"alpha", "beta", "g", "epsilon", "fraud", "capitalism", "", "bo'sä30", - "Me"}, + VectorWithMemoryLimit vecA{ + {lit("alpha"), lit("beta"), lit("g"), lit("epsilon"), lit("fraud"), + lit("capitalism"), lit(""), lit("bo'sä30"), lit("Me")}, makeAllocator()}; - VectorWithMemoryLimit vecB{ - {"alph", "alpha", "f", "epsiloo", "freud", "communism", "", "bo'sä30", - "Me"}, + VectorWithMemoryLimit vecB{ + {lit("alph"), lit("alpha"), lit("f"), lit("epsiloo"), lit("freud"), + lit("communism"), lit(""), lit("bo'sä30"), lit("Me")}, makeAllocator()}; testLessThanGreaterThanEqualMultipleValuesHelper(vecA.clone(), vecB.clone()); // TODO Add a test case for correct unicode collation as soon as that @@ -656,20 +666,22 @@ TEST(RelationalExpression, VariableAndConstant) { {false, false, true}); // ?vocab column is `"Beta", "alpha", "älpha" - testWithExplicitResult(Variable{"?vocab"}, IdOrString{"\"älpha\""}, + testWithExplicitResult(Variable{"?vocab"}, + IdOrLiteralOrIri{lit("\"älpha\"")}, {false, true, true}); - testWithExplicitResult(Variable{"?vocab"}, IdOrString{"\"alpha\""}, + testWithExplicitResult(Variable{"?vocab"}, + IdOrLiteralOrIri{lit("\"alpha\"")}, {true, false, true}); - testWithExplicitResult(IdOrString{"\"atm\""}, Variable{"?vocab"}, - {true, false, false}); + testWithExplicitResult(IdOrLiteralOrIri{lit("\"atm\"")}, + Variable{"?vocab"}, {true, false, false}); // ?mixed column is `1, -0.1, ` auto U = Id::makeUndefined(); auto B = ad_utility::testing::BoolId; - testWithExplicitIdResult(IdOrString{""}, Variable{"?mixed"}, - {U, U, B(true)}); - testWithExplicitIdResult(IdOrString{""}, Variable{"?mixed"}, - {U, U, B(true)}); + testWithExplicitIdResult(IdOrLiteralOrIri{iriref("")}, + Variable{"?mixed"}, {U, U, B(true)}); + testWithExplicitIdResult(IdOrLiteralOrIri{iriref("")}, + Variable{"?mixed"}, {U, U, B(true)}); // Note: `1` and `` are "not compatible", so even the "not equal" // comparison returns false. @@ -754,7 +766,8 @@ TEST(RelationalExpression, VariableAndConstantBinarySearch) { testSortedVariableAndConstant(ints, int64_t{-1}, {{{0, 3}}}); testSortedVariableAndConstant(ints, 0.3, {{{0, 1}, {2, 3}}}); // ints and strings are always incompatible. - testSortedVariableAndConstant(ints, IdOrString{"a string"}, {}); + testSortedVariableAndConstant(ints, IdOrLiteralOrIri{lit("a string")}, + {}); testSortedVariableAndConstant(doubles, int64_t{0}, {{{0, 2}}}); testSortedVariableAndConstant(doubles, 2.8, {{{1, 2}}}); @@ -764,10 +777,13 @@ TEST(RelationalExpression, VariableAndConstantBinarySearch) { testSortedVariableAndConstant(numeric, 1.0, {{{0, 1}}}); testSortedVariableAndConstant(numeric, 3.4, {{{0, 1}, {2, 3}}}); - testSortedVariableAndConstant(vocab, IdOrString{"\"alpha\""}, {{{1, 3}}}); - testSortedVariableAndConstant(vocab, IdOrString{"\"alpha\""}, {{{0, 3}}}); - testSortedVariableAndConstant(vocab, IdOrString{"\"ball\""}, {{{0, 2}}}); - testSortedVariableAndConstant(vocab, IdOrString{"\"älpha\""}, + testSortedVariableAndConstant(vocab, IdOrLiteralOrIri{lit("\"alpha\"")}, + {{{1, 3}}}); + testSortedVariableAndConstant(vocab, IdOrLiteralOrIri{lit("\"alpha\"")}, + {{{0, 3}}}); + testSortedVariableAndConstant(vocab, IdOrLiteralOrIri{lit("\"ball\"")}, + {{{0, 2}}}); + testSortedVariableAndConstant(vocab, IdOrLiteralOrIri{lit("\"älpha\"")}, {{{0, 1}, {2, 3}}}); testSortedVariableAndConstant(vocab, inf, {}); @@ -778,7 +794,8 @@ TEST(RelationalExpression, VariableAndConstantBinarySearch) { // Note: only *numeric* values that are not equal to 1.0 are considered here. testSortedVariableAndConstant(mixed, 1.0, {{{1, 2}}}); testSortedVariableAndConstant(mixed, -inf, {{{0, 2}}}); - testSortedVariableAndConstant(mixed, IdOrString{""}, {{{2, 3}}}); + testSortedVariableAndConstant(mixed, IdOrLiteralOrIri{iriref("")}, + {{{2, 3}}}); } // TODO We currently do not have tests for the `LocalVocab` case, diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index c5c2bd5334..5dee81b2e3 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -39,7 +39,29 @@ auto Voc = ad_utility::testing::VocabId; auto U = Id::makeUndefined(); using Ids = std::vector; -using IdOrStrings = std::vector; +using IdOrLiteralOrIriVec = std::vector; + +auto lit = [](std::string_view s, std::string_view langtagOrDatatype = "") { + return ad_utility::triple_component::LiteralOrIri( + ad_utility::testing::tripleComponentLiteral(s, langtagOrDatatype)); +}; + +auto iriref = [](std::string_view s) { + return ad_utility::triple_component::LiteralOrIri(iri(s)); +}; + +auto idOrLitOrStringVec = + [](const std::vector>& input) { + IdOrLiteralOrIriVec result; + for (const auto& el : input) { + if (std::holds_alternative(el)) { + result.push_back(std::get(el)); + } else { + result.push_back(lit(std::get(el))); + } + } + return result; + }; // All the helper functions `testUnaryExpression` etc. below internally evaluate // the given expressions using the `TestContext` class, so it is possible to use @@ -78,7 +100,8 @@ SingleExpressionResult auto toExpressionResult(T vec) { if constexpr (SingleExpressionResult) { return vec; } else if constexpr (std::convertible_to) { - return IdOrString{std::string{std::string_view{vec}}}; + // TODO Make a generic testing utility for the string case... + return IdOrLiteralOrIri{lit(vec)}; } else { return VectorWithMemoryLimit{ std::make_move_iterator(vec.begin()), @@ -107,16 +130,16 @@ ::testing::Matcher matchId(Id id) { } // Return a matcher that matches a result of an expression that is not a vector. -// If it is an ID (possibly contained in the `IdOrString` variant), then the -// `matchId` matcher from above is used, else we test for equality. +// If it is an ID (possibly contained in the `IdOrLiteralOrIri` variant), then +// the `matchId` matcher from above is used, else we test for equality. template requires(!isVectorResult) auto nonVectorResultMatcher(const T& expected) { if constexpr (std::is_same_v) { return matchId(expected); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { return std::visit( [](const U& el) { - return ::testing::MatcherCast( + return ::testing::MatcherCast( ::testing::VariantWith(nonVectorResultMatcher(el))); }, expected); @@ -256,7 +279,7 @@ TEST(SparqlExpression, logicalOperators) { alloc}; V dAsBool{{B(true), B(true), B(false), B(false)}, alloc}; - V s{{"true", "", "false", ""}, alloc}; + V s{{lit("true"), lit(""), lit("false"), lit("")}, alloc}; V sAsBool{{B(true), B(false), B(true), B(false)}, alloc}; V i{{I(32), I(-42), I(0), I(5)}, alloc}; @@ -325,10 +348,10 @@ TEST(SparqlExpression, logicalOperators) { testAnd(allFalse, b, D(nan)); testAnd(b, b, D(2839.123)); - testOr(allTrue, b, IdOrString{"halo"}); - testOr(b, b, IdOrString("")); - testAnd(allFalse, b, IdOrString("")); - testAnd(b, b, IdOrString("yellow")); + testOr(allTrue, b, IdOrLiteralOrIri{lit("halo")}); + testOr(b, b, IdOrLiteralOrIri(lit(""))); + testAnd(allFalse, b, IdOrLiteralOrIri(lit(""))); + testAnd(b, b, IdOrLiteralOrIri(lit("yellow"))); // Test the behavior in the presence of UNDEF values. Id t = B(true); @@ -492,7 +515,7 @@ TEST(SparqlExpression, dateOperators) { auto testYear = testUnaryExpression<&makeYearExpression>; testYear(Ids{Id::makeFromDouble(42.0)}, Ids{U}); testYear(Ids{Id::makeFromBool(false)}, Ids{U}); - testYear(IdOrStrings{"noDate"}, Ids{U}); + testYear(IdOrLiteralOrIriVec{lit("noDate")}, Ids{U}); } // _____________________________________________________________________________________ @@ -504,22 +527,28 @@ static auto makeStrlenWithStr = [](auto arg) { auto checkStrlenWithStrChild = testUnaryExpression; TEST(SparqlExpression, stringOperators) { // Test `StrlenExpression` and `StrExpression`. - checkStrlen(IdOrStrings{"one", "two", "three", ""}, - Ids{I(3), I(3), I(5), I(0)}); - checkStrlenWithStrChild(IdOrStrings{"one", "two", "three", ""}, - Ids{I(3), I(3), I(5), I(0)}); + checkStrlen( + IdOrLiteralOrIriVec{lit("one"), lit("two"), lit("three"), lit("")}, + Ids{I(3), I(3), I(5), I(0)}); + checkStrlenWithStrChild( + IdOrLiteralOrIriVec{lit("one"), lit("two"), lit("three"), lit("")}, + Ids{I(3), I(3), I(5), I(0)}); // Test the different (optimized) behavior depending on whether the STR() // function was applied to the argument. - checkStrlen(IdOrStrings{"one", I(1), D(3.6), ""}, Ids{I(3), U, U, I(0)}); - checkStrlenWithStrChild(IdOrStrings{"one", I(1), D(3.6), ""}, - Ids{I(3), I(1), I(3), I(0)}); - checkStr(Ids{I(1), I(2), I(3)}, IdOrStrings{"1", "2", "3"}); - checkStr(Ids{D(-1.0), D(1.0), D(2.34)}, IdOrStrings{"-1", "1", "2.34"}); + checkStrlen(IdOrLiteralOrIriVec{lit("one"), I(1), D(3.6), lit("")}, + Ids{I(3), U, U, I(0)}); + checkStrlenWithStrChild( + IdOrLiteralOrIriVec{lit("one"), I(1), D(3.6), lit("")}, + Ids{I(3), I(1), I(3), I(0)}); + checkStr(Ids{I(1), I(2), I(3)}, + IdOrLiteralOrIriVec{lit("1"), lit("2"), lit("3")}); + checkStr(Ids{D(-1.0), D(1.0), D(2.34)}, + IdOrLiteralOrIriVec{lit("-1"), lit("1"), lit("2.34")}); checkStr(Ids{B(true), B(false), B(true)}, - IdOrStrings{"true", "false", "true"}); - checkStr(IdOrStrings{"one", "two", "three"}, - IdOrStrings{"one", "two", "three"}); + IdOrLiteralOrIriVec{lit("true"), lit("false"), lit("true")}); + checkStr(IdOrLiteralOrIriVec{lit("one"), lit("two"), lit("three")}, + IdOrLiteralOrIriVec{lit("one"), lit("two"), lit("three")}); // A simple test for uniqueness of the cache key. auto c1a = makeStrlenExpression(std::make_unique(iri(""))) @@ -552,10 +581,10 @@ TEST(SparqlExpression, stringOperators) { auto checkUcase = testUnaryExpression<&makeUppercaseExpression>; auto checkLcase = testUnaryExpression<&makeLowercaseExpression>; TEST(SparqlExpression, uppercaseAndLowercase) { - checkLcase(IdOrStrings{"One", "tWÖ", U, I(12)}, - IdOrStrings{"one", "twö", U, U}); - checkUcase(IdOrStrings{"One", "tWÖ", U, I(12)}, - IdOrStrings{"ONE", "TWÖ", U, U}); + checkLcase(IdOrLiteralOrIriVec{lit("One"), lit("tWÖ"), U, I(12)}, + IdOrLiteralOrIriVec{lit("one"), lit("twö"), U, U}); + checkUcase(IdOrLiteralOrIriVec{lit("One"), lit("tWÖ"), U, I(12)}, + IdOrLiteralOrIriVec{lit("ONE"), lit("TWÖ"), U, U}); } // _____________________________________________________________________________________ @@ -570,35 +599,55 @@ auto checkStrBefore = std::bind_front(testNaryExpression, &makeStrBeforeExpression); TEST(SparqlExpression, binaryStringOperations) { // Test STRSTARTS, STRENDS, CONTAINS, STRBEFORE, and STRAFTER. - using S = IdOrStrings; auto F = Id::makeFromBool(false); auto T = Id::makeFromBool(true); + auto S = [](const std::vector>& input) { + IdOrLiteralOrIriVec result; + for (const auto& el : input) { + if (std::holds_alternative(el)) { + result.push_back(std::get(el)); + } else { + result.push_back(lit(std::get(el))); + } + } + return result; + }; checkStrStarts( Ids{T, F, T, F, T, T, F, F, F}, - S{"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}, - S{"", "x", "", "Hallo", "Häl", "Hällo", "Hällox", "ll", "lo"}); - checkStrEnds( - Ids{T, F, T, F, T, T, F, F, F}, - S{"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}, - S{"", "x", "", "Hallo", "o", "Hällo", "Hällox", "ll", "H"}); + S({"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", + "Hällo"}), + S({"", "x", "", "Hallo", "Häl", "Hällo", "Hällox", "ll", "lo"})); + checkStrEnds(Ids{T, F, T, F, T, T, F, F, F}, + S({"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", + "Hällo"}), + S({"", "x", "", "Hallo", "o", "Hällo", "Hällox", "ll", "H"})); checkContains(Ids{T, F, T, F, T, T, F}, - S{"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}, - S{"", "x", "", "ullo", "ll", "Hällo", "Hällox"}); - checkStrAfter(S{"", "", "Hällo", "", "o", "", "", "lo"}, - S{"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}, - S{"", "x", "", "ullo", "ll", "Hällo", "Hällox", "l"}); + S({"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}), + S({"", "x", "", "ullo", "ll", "Hällo", "Hällox"})); + checkStrAfter( + S({"", "", "Hällo", "", "o", "", "", "lo"}), + S({"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}), + S({"", "x", "", "ullo", "ll", "Hällo", "Hällox", "l"})); checkStrBefore( - S{"", "", "Hällo", "", "Hä", "", "", "Hä"}, - S{"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}, - S{"", "x", "", "ullo", "ll", "Hällo", "Hällox", "l"}); + S({"", "", "Hällo", "", "Hä", "", "", "Hä"}), + S({"", "", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo", "Hällo"}), + S({"", "x", "", "ullo", "ll", "Hällo", "Hällox", "l"})); } // ______________________________________________________________________________ static auto checkSubstr = std::bind_front(testNaryExpression, makeSubstrExpression); TEST(SparqlExpression, substr) { - auto strs = [](const IdOrStrings& input) { - return VectorWithMemoryLimit(input.begin(), input.end(), alloc); + auto strs = [](const std::vector>& input) { + VectorWithMemoryLimit result(alloc); + for (const auto& el : input) { + if (std::holds_alternative(el)) { + result.push_back(std::get(el)); + } else { + result.push_back(lit(std::get(el))); + } + } + return result; }; // Remember: The start position (the second argument to the SUBSTR expression) @@ -642,15 +691,17 @@ TEST(SparqlExpression, substr) { // Invalid datatypes // First must be string. - auto Ux = IdOrString{U}; + auto Ux = IdOrLiteralOrIri{U}; checkSubstr(Ux, I(3), I(4), I(7)); checkSubstr(Ux, U, I(4), I(7)); checkSubstr(Ux, Ux, I(4), I(7)); // Second and third must be numeric; - checkSubstr(Ux, IdOrString{"hello"}, U, I(4)); - checkSubstr(Ux, IdOrString{"hello"}, IdOrString{"bye"}, I(17)); - checkSubstr(Ux, IdOrString{"hello"}, I(4), U); - checkSubstr(Ux, IdOrString{"hello"}, I(4), IdOrString{"bye"}); + checkSubstr(Ux, IdOrLiteralOrIri{lit("hello")}, U, I(4)); + checkSubstr(Ux, IdOrLiteralOrIri{lit("hello")}, IdOrLiteralOrIri{lit("bye")}, + I(17)); + checkSubstr(Ux, IdOrLiteralOrIri{lit("hello")}, I(4), U); + checkSubstr(Ux, IdOrLiteralOrIri{lit("hello")}, I(4), + IdOrLiteralOrIri{lit("bye")}); } // _____________________________________________________________________________________ @@ -662,7 +713,7 @@ TEST(SparqlExpression, unaryNegate) { Ids{B(true), B(false), I(0), I(3), D(0), D(12), D(naN), U}, Ids{B(false), B(true), B(true), B(false), B(true), B(false), B(true), U}); // Empty strings are considered to be true. - checkNegate(IdOrStrings{"true", "false", "", "blibb"}, + checkNegate(idOrLitOrStringVec({"true", "false", "", "blibb"}), Ids{B(false), B(false), B(true), B(false)}); } @@ -673,7 +724,8 @@ TEST(SparqlExpression, unaryMinus) { checkMinus( Ids{B(true), B(false), I(0), I(3), D(0), D(12.8), D(naN), U, Voc(6)}, Ids{I(-1), I(0), I(0), I(-3), D(-0.0), D(-12.8), D(-naN), U, U}); - checkMinus(IdOrStrings{"true", "false", "", ""}, Ids{U, U, U, U}); + checkMinus(idOrLitOrStringVec({"true", "false", "", ""}), + Ids{U, U, U, U}); } // _____________________________________________________________________________________ @@ -738,8 +790,9 @@ TEST(SparqlExpression, isSomethingFunctions) { Id blank2 = Id::makeFromBlankNodeIndex(BlankNodeIndex::make(12)); Id localLiteral = testContext().notInVocabA; - IdOrStrings testIdOrStrings{"", "\"l\"", blank2, iri, literal, - blank, localLiteral, I(42), D(1), U}; + IdOrLiteralOrIriVec testIdOrStrings = + IdOrLiteralOrIriVec{iriref(""), lit("\"l\""), blank2, iri, literal, + blank, localLiteral, I(42), D(1), U}; testUnaryExpression(testIdOrStrings, Ids{T, F, F, T, F, F, F, F, F, F}); testUnaryExpression(testIdOrStrings, @@ -758,16 +811,20 @@ TEST(SparqlExpression, geoSparqlExpressions) { auto checkLong = testUnaryExpression<&makeLongitudeExpression>; auto checkDist = std::bind_front(testNaryExpression, &makeDistExpression); - checkLat(IdOrStrings{"POINT(24.3 26.8)", "NotAPoint", I(12)}, + checkLat(idOrLitOrStringVec({"POINT(24.3 26.8)", "NotAPoint", I(12)}), Ids{D(26.8), U, U}); - checkLong(IdOrStrings{D(4.2), "POINT(24.3 26.8)", "NotAPoint"}, + checkLong(idOrLitOrStringVec({D(4.2), "POINT(24.3 26.8)", "NotAPoint"}), Ids{U, D(24.3), U}); - checkDist(D(0.0), IdOrString{"POINT(24.3 26.8)"}, - IdOrString{"POINT(24.3 26.8)"}); - checkDist(U, IdOrString{"POINT(24.3 26.8)"}, IdOrString{I(12)}); - checkDist(U, IdOrString{I(12)}, IdOrString{"POINT(24.3 26.8)"}); - checkDist(U, IdOrString{"POINT(24.3 26.8)"s}, IdOrString{"NotAPoint"}); - checkDist(U, IdOrString{"NotAPoint"}, IdOrString{"POINT(24.3 26.8)"}); + checkDist(D(0.0), IdOrLiteralOrIri{lit("POINT(24.3 26.8)")}, + IdOrLiteralOrIri{lit("POINT(24.3 26.8)")}); + checkDist(U, IdOrLiteralOrIri{lit("POINT(24.3 26.8)")}, + IdOrLiteralOrIri{I(12)}); + checkDist(U, IdOrLiteralOrIri{I(12)}, + IdOrLiteralOrIri{lit("POINT(24.3 26.8)")}); + checkDist(U, IdOrLiteralOrIri{lit("POINT(24.3 26.8)"s)}, + IdOrLiteralOrIri{lit("NotAPoint")}); + checkDist(U, IdOrLiteralOrIri{lit("NotAPoint")}, + IdOrLiteralOrIri{lit("POINT(24.3 26.8)")}); } // ________________________________________________________________________________________ @@ -778,29 +835,31 @@ TEST(SparqlExpression, ifAndCoalesce) { const auto T = Id::makeFromBool(true); const auto F = Id::makeFromBool(false); - checkIf( - IdOrStrings{I(0), "eins", I(2), I(3), "vier", "fünf"}, + checkIf(idOrLitOrStringVec({I(0), "eins", I(2), I(3), "vier", "fünf"}), + // UNDEF and the empty string are considered to be `false`. + std::tuple{idOrLitOrStringVec({T, F, T, "true", U, ""}), + Ids{I(0), I(1), I(2), I(3), I(4), I(5)}, + idOrLitOrStringVec( + {"null", "eins", "zwei", "drei", "vier", "fünf"})}); + checkCoalesce( + idOrLitOrStringVec({I(0), "eins", I(2), I(3), U, D(5.0)}), // UNDEF and the empty string are considered to be `false`. - std::tuple{IdOrStrings{T, F, T, "true", U, ""}, - Ids{I(0), I(1), I(2), I(3), I(4), I(5)}, - IdOrStrings{"null", "eins", "zwei", "drei", "vier", "fünf"}}); - checkCoalesce(IdOrStrings{I(0), "eins", I(2), I(3), U, D(5.0)}, - // UNDEF and the empty string are considered to be `false`. - std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.0)}, - IdOrStrings{"null", "eins", "zwei", "drei", U, U}, - Ids{U, U, U, U, U, D(5.0)}}); + std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.0)}, + idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), + Ids{U, U, U, U, U, D(5.0)}}); // Example for COALESCE where we have a constant input and evaluating the last // input is not necessary. - checkCoalesce(IdOrStrings{I(0), "eins", I(2), I(3), "eins", D(5.0)}, - // UNDEF and the empty string are considered to be `false`. - std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.0)}, U, - IdOrString{"eins"}, Ids{U, U, U, U, U, D(5.0)}}); + checkCoalesce( + idOrLitOrStringVec({I(0), "eins", I(2), I(3), "eins", D(5.0)}), + // UNDEF and the empty string are considered to be `false`. + std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.0)}, U, + IdOrLiteralOrIri{lit("eins")}, Ids{U, U, U, U, U, D(5.0)}}); // Check COALESCE with no arguments or empty arguments. - checkCoalesce(IdOrStrings{}, std::tuple{}); - checkCoalesce(IdOrStrings{}, std::tuple{Ids{}}); - checkCoalesce(IdOrStrings{}, std::tuple{Ids{}, Ids{}}); - checkCoalesce(IdOrStrings{}, std::tuple{Ids{}, Ids{}, Ids{}}); + checkCoalesce(IdOrLiteralOrIriVec{}, std::tuple{}); + checkCoalesce(IdOrLiteralOrIriVec{}, std::tuple{Ids{}}); + checkCoalesce(IdOrLiteralOrIriVec{}, std::tuple{Ids{}, Ids{}}); + checkCoalesce(IdOrLiteralOrIriVec{}, std::tuple{Ids{}, Ids{}, Ids{}}); auto coalesceExpr = makeCoalesceExpressionVariadic( std::make_unique(iri("")), @@ -816,25 +875,28 @@ TEST(SparqlExpression, concatExpression) { auto checkConcat = testNaryExpressionVec; const auto T = Id::makeFromBool(true); - checkConcat(IdOrStrings{"0null", "eins", "2zwei", "3drei", "", "5.35.2"}, - // UNDEF evaluates to an empty string.. - std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.3)}, - IdOrStrings{"null", "eins", "zwei", "drei", U, U}, - Ids{U, U, U, U, U, D(5.2)}}); + checkConcat( + idOrLitOrStringVec({"0null", "eins", "2zwei", "3drei", "", "5.35.2"}), + // UNDEF evaluates to an empty string.. + std::tuple{Ids{I(0), U, I(2), I(3), U, D(5.3)}, + idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), + Ids{U, U, U, U, U, D(5.2)}}); // Example with some constants in the middle. - checkConcat(IdOrStrings{"0trueeins", "trueeins", "2trueeins", "3trueeins", - "trueeins", "12.3trueeins-2.1"}, - // UNDEF and the empty string are considered to be `false`. - std::tuple{Ids{I(0), U, I(2), I(3), U, D(12.3)}, T, - IdOrString{"eins"}, Ids{U, U, U, U, U, D(-2.1)}}); + checkConcat( + idOrLitOrStringVec({"0trueeins", "trueeins", "2trueeins", "3trueeins", + "trueeins", "12.3trueeins-2.1"}), + // UNDEF and the empty string are considered to be `false`. + std::tuple{Ids{I(0), U, I(2), I(3), U, D(12.3)}, T, + IdOrLiteralOrIri{lit("eins")}, Ids{U, U, U, U, U, D(-2.1)}}); // Only constants - checkConcat(IdOrString{"trueMe1"}, std::tuple{T, IdOrString{"Me"}, I(1)}); + checkConcat(IdOrLiteralOrIri{lit("trueMe1")}, + std::tuple{T, IdOrLiteralOrIri{lit("Me")}, I(1)}); // Constants at the beginning. - checkConcat(IdOrStrings{"trueMe1", "trueMe2"}, - std::tuple{T, IdOrString{"Me"}, Ids{I(1), I(2)}}); + checkConcat(IdOrLiteralOrIriVec{lit("trueMe1"), lit("trueMe2")}, + std::tuple{T, IdOrLiteralOrIri{lit("Me")}, Ids{I(1), I(2)}}); - checkConcat(IdOrString{""}, std::tuple{}); + checkConcat(IdOrLiteralOrIri{lit("")}, std::tuple{}); auto coalesceExpr = makeConcatExpressionVariadic( std::make_unique(iri("")), std::make_unique(iri(""))); @@ -848,36 +910,43 @@ TEST(SparqlExpression, concatExpression) { TEST(SparqlExpression, ReplaceExpression) { auto checkReplace = testNaryExpressionVec<&makeReplaceExpression>; // A simple replace( no regexes involved). - checkReplace(IdOrStrings{"null", "Eins", "zwEi", "drEi", U, U}, - std::tuple{IdOrStrings{"null", "eins", "zwei", "drei", U, U}, - IdOrString{"e"}, IdOrString{"E"}}); + checkReplace( + idOrLitOrStringVec({"null", "Eins", "zwEi", "drEi", U, U}), + std::tuple{idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), + IdOrLiteralOrIri{lit("e")}, IdOrLiteralOrIri{lit("E")}}); // A somewhat more involved regex - checkReplace(IdOrStrings{"null", "Xs", "zwei", "drei", U, U}, - std::tuple{IdOrStrings{"null", "eins", "zwei", "drei", U, U}, - IdOrString{"e.[a-z]"}, IdOrString{"X"}}); + checkReplace( + idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}), + std::tuple{idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), + IdOrLiteralOrIri{lit("e.[a-z]")}, IdOrLiteralOrIri{lit("X")}}); // Case-insensitive matching using the hack for google regex: - checkReplace(IdOrStrings{"null", "xxns", "zwxx", "drxx"}, - std::tuple{IdOrStrings{"null", "eIns", "zwEi", "drei"}, - IdOrString{"(?i)[ei]"}, IdOrString{"x"}}); + checkReplace(idOrLitOrStringVec({"null", "xxns", "zwxx", "drxx"}), + std::tuple{idOrLitOrStringVec({"null", "eIns", "zwEi", "drei"}), + IdOrLiteralOrIri{lit("(?i)[ei]")}, + IdOrLiteralOrIri{lit("x")}}); // Multiple matches withing the same string checkReplace( - IdOrString{"wEeDEflE"}, - std::tuple{IdOrString{"weeeDeeflee"}, IdOrString{"ee"}, IdOrString{"E"}}); + IdOrLiteralOrIri{lit("wEeDEflE")}, + std::tuple{IdOrLiteralOrIri{lit("weeeDeeflee")}, + IdOrLiteralOrIri{lit("ee")}, IdOrLiteralOrIri{lit("E")}}); // Illegal regex. - checkReplace(IdOrStrings{U, U, U, U, U, U}, - std::tuple{IdOrStrings{"null", "Xs", "zwei", "drei", U, U}, - IdOrString{"e.[a-z"}, IdOrString{"X"}}); + checkReplace( + IdOrLiteralOrIriVec{U, U, U, U, U, U}, + std::tuple{idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}), + IdOrLiteralOrIri{lit("e.[a-z")}, IdOrLiteralOrIri{lit("X")}}); // Undefined regex - checkReplace(IdOrStrings{U, U, U, U, U, U}, - std::tuple{IdOrStrings{"null", "Xs", "zwei", "drei", U, U}, U, - IdOrString{"X"}}); + checkReplace( + IdOrLiteralOrIriVec{U, U, U, U, U, U}, + std::tuple{idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}), U, + IdOrLiteralOrIri{lit("X")}}); // Illegal replacement. - checkReplace(IdOrStrings{U, U, U, U, U, U}, - std::tuple{IdOrStrings{"null", "Xs", "zwei", "drei", U, U}, - IdOrString{"e"}, Id::makeUndefined()}); + checkReplace( + IdOrLiteralOrIriVec{U, U, U, U, U, U}, + std::tuple{idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}), + IdOrLiteralOrIri{lit("e")}, Id::makeUndefined()}); } // ______________________________________________________________________________ @@ -887,15 +956,16 @@ TEST(SparqlExpression, literalExpression) { "\"notInTheVocabulary\"")}; // Evaluate multiple times to test the caching behavior. for (size_t i = 0; i < 15; ++i) { - ASSERT_EQ((ExpressionResult{IdOrString{"\"notInTheVocabulary\""}}), - expr.evaluate(&ctx.context)); + ASSERT_EQ( + (ExpressionResult{IdOrLiteralOrIri{lit("\"notInTheVocabulary\"")}}), + expr.evaluate(&ctx.context)); } // A similar test with a constant entry that is part of the vocabulary and can // therefore be converted to an ID. IriExpression iriExpr{TripleComponent::Iri::fromIriref("")}; Id idOfX = ctx.x; for (size_t i = 0; i < 15; ++i) { - ASSERT_EQ((ExpressionResult{IdOrString{idOfX}}), + ASSERT_EQ((ExpressionResult{IdOrLiteralOrIri{idOfX}}), iriExpr.evaluate(&ctx.context)); } } @@ -903,13 +973,16 @@ TEST(SparqlExpression, literalExpression) { // ______________________________________________________________________________ TEST(SparqlExpression, encodeForUri) { auto checkEncodeForUri = testUnaryExpression<&makeEncodeForUriExpression>; - using IoS = IdOrString; + using IoS = IdOrLiteralOrIri; + auto l = [](std::string_view s, std::string_view langOrDatatype = "") { + return IoS{lit(s, langOrDatatype)}; + }; checkEncodeForUri("Los Angeles", "Los%20Angeles"); - checkEncodeForUri("\"Los Angeles\"@en", "Los%20Angeles"); - checkEncodeForUri("\"Los Angeles\"^^xsd:string", "Los%20Angeles"); - checkEncodeForUri("\"Los \\\"Angeles\"^^xsd:string", "Los%20%5C%22Angeles"); - checkEncodeForUri("\"Los Angeles", "%22Los%20Angeles"); - checkEncodeForUri("L\"os \"Angeles", "L%22os%20%22Angeles"); + checkEncodeForUri(l("Los Angeles", "@en"), "Los%20Angeles"); + checkEncodeForUri(l("Los Angeles", "^^"), "Los%20Angeles"); + checkEncodeForUri(l("Los Ängeles", "^^"), "Los%20%C3%84ngeles"); + checkEncodeForUri(l("\"Los Angeles"), "%22Los%20Angeles"); + checkEncodeForUri(l("L\"os \"Angeles"), "L%22os%20%22Angeles"); // Literals from the global and local vocab. checkEncodeForUri(testContext().aelpha, "%C3%A4lpha"); checkEncodeForUri(testContext().notInVocabA, "notInVocabA"); diff --git a/test/SparqlExpressionTypesTest.cpp b/test/SparqlExpressionTypesTest.cpp index af2bc66f99..a05d650185 100644 --- a/test/SparqlExpressionTypesTest.cpp +++ b/test/SparqlExpressionTypesTest.cpp @@ -29,13 +29,15 @@ TEST(SparqlExpressionTypes, expressionResult) { } TEST(SparqlExpressionTypes, printIdOrString) { + using namespace ad_utility::triple_component; std::stringstream str; - IdOrString idOrString{Id::makeUndefined()}; + + IdOrLiteralOrIri idOrString{Id::makeUndefined()}; PrintTo(idOrString, &str); ASSERT_EQ(str.str(), "Undefined:Undefined"); - idOrString = "bimm"; + idOrString = LiteralOrIri::literalWithoutQuotes("bimm"); // Clear the stringstream. str.str({}); PrintTo(idOrString, &str); - ASSERT_EQ(str.str(), "bimm"); + ASSERT_EQ(str.str(), "\"bimm\""); } diff --git a/test/StringUtilsTest.cpp b/test/StringUtilsTest.cpp index 2aca0783dd..5c4763f926 100644 --- a/test/StringUtilsTest.cpp +++ b/test/StringUtilsTest.cpp @@ -313,3 +313,11 @@ TEST(StringUtilsTest, insertThousandSeparator) { forbiddenSymbolTest .template operator()<'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'>(); } + +TEST(StringUtilsTest, findLiteralEnd) { + using namespace ad_utility; + EXPECT_EQ(findLiteralEnd("nothing", "\""), std::string_view::npos); + EXPECT_EQ(findLiteralEnd("no\"thing", "\""), 2u); + EXPECT_EQ(findLiteralEnd("no\\\"thi\"ng", "\""), 7u); + EXPECT_EQ(findLiteralEnd("no\\\\\"thing", "\""), 4u); +} diff --git a/test/parser/LiteralOrIriTest.cpp b/test/parser/LiteralOrIriTest.cpp index 5afcc5e8d5..0c5486fc7b 100644 --- a/test/parser/LiteralOrIriTest.cpp +++ b/test/parser/LiteralOrIriTest.cpp @@ -10,6 +10,7 @@ #include "parser/Literal.h" #include "parser/LiteralOrIri.h" #include "parser/NormalizedString.h" +#include "util/HashSet.h" using namespace ad_utility::triple_component; @@ -194,3 +195,17 @@ TEST(LiteralOrIri, EnsureLiteralsAreEncoded) { EXPECT_THAT(R"(This is to be "\ encoded)", asStringViewUnsafe(literal2.getContent())); } + +TEST(LiteralOrIri, Printing) { + LiteralOrIri literal1 = LiteralOrIri::literalWithoutQuotes("hallo"); + std::stringstream str; + PrintTo(literal1, &str); + EXPECT_EQ(str.str(), "\"hallo\""); +} + +TEST(LiteralOrIri, Hashing) { + auto lit = LiteralOrIri::literalWithoutQuotes("bimbamm"); + auto iri = LiteralOrIri::iriref(""); + ad_utility::HashSet set{lit, iri}; + EXPECT_THAT(set, ::testing::UnorderedElementsAre(lit, iri)); +} diff --git a/test/util/TripleComponentTestHelpers.h b/test/util/TripleComponentTestHelpers.h index 5c108fe8bc..dcaad52808 100644 --- a/test/util/TripleComponentTestHelpers.h +++ b/test/util/TripleComponentTestHelpers.h @@ -13,7 +13,12 @@ namespace ad_utility::testing { // `literal` (which must be enclosed in double quotes) and the optional // `langtagOrDatatype` (which must start with `@` or `^^`). constexpr auto tripleComponentLiteral = - [](const std::string& literal, std::string_view langtagOrDatatype = "") { + [](std::string_view literal, std::string_view langtagOrDatatype = "") { + std::string dummy; + if (!(literal.starts_with('"') && literal.ends_with('"'))) { + dummy = absl::StrCat("\"", literal, "\""); + literal = dummy; + } if (langtagOrDatatype.starts_with("@")) { return TripleComponent::Literal::fromEscapedRdfLiteral( literal, std::string(langtagOrDatatype)); From c339978cd457f96cacea4027c89a587e06e6fd9b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 18 Apr 2024 00:43:16 +0200 Subject: [PATCH 2/5] Only store offsets, no longer indices, for the external vocabulary (#1320) Exploit that the vocab indices in our external vocabulary are contiguous, by now storing only the offset of each word in `.vocabulary.external.words`, and not, as we did so far, also its vocab index. This halves the size of the former `vocabulary.external.words.idsAndOffsets` file, which is now called `.vocabulary.external.words.offsets`. Moreover, to access a word by its vocab index, we don't need a binary search anymore. This change breaks the index format! Note: We could save further space by storing each offset in 48 bits, instead of in 64 bits. Even further compression is possible. --- src/index/IndexFormatVersion.h | 2 +- src/index/VocabularyOnDisk.cpp | 91 ++++++++++------------------------ src/index/VocabularyOnDisk.h | 59 ++++------------------ test/VocabularyOnDiskTest.cpp | 24 --------- test/util/IndexTestHelpers.cpp | 2 +- 5 files changed, 40 insertions(+), 138 deletions(-) diff --git a/src/index/IndexFormatVersion.h b/src/index/IndexFormatVersion.h index 101134ce73..3c1fbf4260 100644 --- a/src/index/IndexFormatVersion.h +++ b/src/index/IndexFormatVersion.h @@ -36,5 +36,5 @@ struct IndexFormatVersion { // The actual index version. Change it once the binary format of the index // changes. inline const IndexFormatVersion& indexFormatVersion{ - 1296, DateOrLargeYear{Date{2024, 3, 8}}}; + 1320, DateOrLargeYear{Date{2024, 4, 17}}}; } // namespace qlever diff --git a/src/index/VocabularyOnDisk.cpp b/src/index/VocabularyOnDisk.cpp index 51ec204806..a16838be95 100644 --- a/src/index/VocabularyOnDisk.cpp +++ b/src/index/VocabularyOnDisk.cpp @@ -12,35 +12,19 @@ using OffsetAndSize = VocabularyOnDisk::OffsetAndSize; // ____________________________________________________________________________ -std::optional VocabularyOnDisk::getOffsetAndSize( - uint64_t idx) const { - IndexAndOffset idAndDummyOffset{idx, 0}; - auto it = std::ranges::lower_bound(idsAndOffsets_, idAndDummyOffset); - if (it >= idsAndOffsets_.end() - 1 || it->idx_ != idx) { - return std::nullopt; - } - return getOffsetAndSizeForIthElement(it - idsAndOffsets_.begin()); -} - -// ____________________________________________________________________________ -VocabularyOnDisk::OffsetSizeId VocabularyOnDisk::getOffsetSizeIdForIthElement( - uint64_t i) const { - AD_CONTRACT_CHECK(i < size()); - const auto offset = idsAndOffsets_[i].offset_; - const auto nextOffset = idsAndOffsets_[i + 1].offset_; - return OffsetSizeId{offset, nextOffset - offset, idsAndOffsets_[i].idx_}; +OffsetAndSize VocabularyOnDisk::getOffsetAndSize(uint64_t i) const { + AD_CORRECTNESS_CHECK(i < size()); + const auto offset = offsets_[i]; + const auto nextOffset = offsets_[i + 1]; + return {offset, nextOffset - offset}; } // _____________________________________________________________________________ -std::optional VocabularyOnDisk::operator[](uint64_t idx) const { - auto optionalOffsetAndSize = getOffsetAndSize(idx); - if (!optionalOffsetAndSize.has_value()) { - return std::nullopt; - } - - string result(optionalOffsetAndSize->_size, '\0'); - file_.read(result.data(), optionalOffsetAndSize->_size, - optionalOffsetAndSize->_offset); +std::string VocabularyOnDisk::operator[](uint64_t idx) const { + AD_CONTRACT_CHECK(idx < size()); + auto offsetAndSize = getOffsetAndSize(idx); + string result(offsetAndSize._size, '\0'); + file_.read(result.data(), offsetAndSize._size, offsetAndSize._offset); return result; } @@ -50,27 +34,22 @@ void VocabularyOnDisk::buildFromIterable(Iterable&& it, const string& fileName) { { file_.open(fileName.c_str(), "w"); - ad_utility::MmapVector idsAndOffsets( - fileName + offsetSuffix_, ad_utility::CreateTag{}); + ad_utility::MmapVector offsets(fileName + offsetSuffix_, + ad_utility::CreateTag{}); uint64_t currentOffset = 0; - std::optional previousId = std::nullopt; + uint64_t nextId = 0; for (const auto& [word, id] : it) { - AD_CONTRACT_CHECK(!previousId.has_value() || previousId.value() < id); - idsAndOffsets.push_back(IndexAndOffset{id, currentOffset}); + AD_CONTRACT_CHECK(nextId == id); + ++nextId; + offsets.push_back(currentOffset); currentOffset += file_.write(word.data(), word.size()); - previousId = id; } // End offset of last vocabulary entry, also consistent with the empty // vocabulary. - if (previousId.has_value()) { - idsAndOffsets.push_back( - IndexAndOffset{previousId.value() + 1, currentOffset}); - } else { - idsAndOffsets.push_back(IndexAndOffset{highestIdx_ + 1, currentOffset}); - } + offsets.push_back(currentOffset); file_.close(); - } // After this close, the destructor of MmapVector is called, whoch dumps + } // After this close, the destructor of MmapVector is called, which dumps // everything to disk. open(fileName); } @@ -78,17 +57,13 @@ void VocabularyOnDisk::buildFromIterable(Iterable&& it, // _____________________________________________________________________________ VocabularyOnDisk::WordWriter::WordWriter(const std::string& outFilename) : file_{outFilename.c_str(), "w"}, - idsAndOffsets_{absl::StrCat(outFilename, VocabularyOnDisk::offsetSuffix_), - ad_utility::CreateTag{}} {} + offsets_{absl::StrCat(outFilename, VocabularyOnDisk::offsetSuffix_), + ad_utility::CreateTag{}} {} // _____________________________________________________________________________ void VocabularyOnDisk::WordWriter::operator()(std::string_view word) { - AD_CONTRACT_CHECK(!previousId_.has_value() || - previousId_.value() < currentIndex_); - idsAndOffsets_.push_back(IndexAndOffset{currentIndex_, currentOffset_}); + offsets_.push_back(currentOffset_); currentOffset_ += file_.write(word.data(), word.size()); - previousId_ = currentIndex_; - ++currentIndex_; } // _____________________________________________________________________________ @@ -98,10 +73,9 @@ void VocabularyOnDisk::WordWriter::finish() { } // End offset of last vocabulary entry, also consistent with the empty // vocabulary. - auto endIndex = previousId_.value_or(VocabularyOnDisk::highestIndexEmpty_); - idsAndOffsets_.push_back(IndexAndOffset{endIndex, currentOffset_}); + offsets_.push_back(currentOffset_); file_.close(); - idsAndOffsets_.close(); + offsets_.close(); } // _____________________________________________________________________________ @@ -120,21 +94,10 @@ void VocabularyOnDisk::buildFromStringsAndIds( // _____________________________________________________________________________ void VocabularyOnDisk::open(const std::string& filename) { file_.open(filename.c_str(), "r"); - idsAndOffsets_.open(filename + offsetSuffix_); - AD_CONTRACT_CHECK(idsAndOffsets_.size() > 0); - size_ = idsAndOffsets_.size() - 1; + offsets_.open(filename + offsetSuffix_); + AD_CORRECTNESS_CHECK(offsets_.size() > 0); + size_ = offsets_.size() - 1; if (size_ > 0) { - highestIdx_ = (*(end() - 1))._index; + highestIdx_ = size_ - 1; } } - -// ____________________________________________________________________________ -WordAndIndex VocabularyOnDisk::getIthElement(size_t n) const { - AD_CONTRACT_CHECK(n < idsAndOffsets_.size()); - auto offsetSizeId = getOffsetSizeIdForIthElement(n); - - std::string result(offsetSizeId._size, '\0'); - file_.read(result.data(), offsetSizeId._size, offsetSizeId._offset); - - return {std::move(result), offsetSizeId._id}; -} diff --git a/src/index/VocabularyOnDisk.h b/src/index/VocabularyOnDisk.h index dd45b2e98e..9bc258459a 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/VocabularyOnDisk.h @@ -19,24 +19,13 @@ // then binary search for a string can be performed. class VocabularyOnDisk { private: - // An ID and the offset of the corresponding word in the underlying file. - struct IndexAndOffset { - uint64_t idx_; - uint64_t offset_; - // Compare only by the IDs. - auto operator<=>(const IndexAndOffset& rhs) const { - return idx_ <=> rhs.idx_; - } - // Equality comparison is currently unused, but it must be declared to be - // able to use `std::ranges::lower_bound` etc. - bool operator==(const IndexAndOffset& rhs) const; - }; - + // The offset of a word in the underlying file. + using Offset = uint64_t; // The file in which the words are stored. mutable ad_utility::File file_; // The IDs and offsets of the words. - ad_utility::MmapVectorView idsAndOffsets_; + ad_utility::MmapVectorView offsets_; // The highest ID that occurs in the vocabulary. If the vocabulary is empty, // this will be Id(-1), s.t. highestIdx_ + 1 will overflow to 0. @@ -48,7 +37,7 @@ class VocabularyOnDisk { // This suffix is appended to the filename of the main file, in order to get // the name for the file in which IDs and offsets are stored. - static constexpr std::string_view offsetSuffix_ = ".idsAndOffsets"; + static constexpr std::string_view offsetSuffix_ = ".offsets"; public: // A helper class that is used to build a vocabulary word by word. @@ -59,10 +48,8 @@ class VocabularyOnDisk { class WordWriter { private: ad_utility::File file_; - ad_utility::MmapVector idsAndOffsets_; + ad_utility::MmapVector offsets_; uint64_t currentOffset_ = 0; - std::optional previousId_ = std::nullopt; - uint64_t currentIndex_ = 0; bool isFinished_ = false; ad_utility::ThrowInDestructorIfSafe throwInDestructorIfSafe_; @@ -87,9 +74,9 @@ class VocabularyOnDisk { /// this file, for example via `buildFromVector` or `buildFromTextFile`. void open(const std::string& filename); - /// If an entry with this `idx` exists, return the corresponding string, else - /// `std::nullopt` - std::optional operator[](uint64_t idx) const; + // Return the word that is stored at the index. Throw an exception if `idx >= + // size`. + std::string operator[](uint64_t idx) const; /// Get the number of words in the vocabulary. size_t size() const { return size_; } @@ -154,24 +141,15 @@ class VocabularyOnDisk { return iteratorToWordAndIndex(it); } - // The offset of a word in `file_`, its size in number of bytes and its ID - struct OffsetSizeId { - uint64_t _offset; - uint64_t _size; - uint64_t _id; - }; - // The offset of a word in `file_` and its size in number of bytes. struct OffsetAndSize { uint64_t _offset; uint64_t _size; - OffsetAndSize(OffsetSizeId osi) : _offset{osi._offset}, _size{osi._size} {} - OffsetAndSize() = default; }; // Helper function for implementing a random access iterator. using Accessor = decltype([](const auto& vocabulary, uint64_t index) { - return vocabulary.getIthElement(index); + return vocabulary[index]; }); // Const random access iterators, implemented via the // `IteratorForAccessOperator` template. @@ -181,11 +159,6 @@ class VocabularyOnDisk { const_iterator end() const { return {this, size()}; } private: - // Return the `i`-th element from this vocabulary. Note that this is (in - // general) NOT the element with the ID `i`, because the ID space is not - // contiguous. - WordAndIndex getIthElement(size_t n) const; - // Takes a lambda that compares two string-like objects and returns a lambda // that compares two objects, either of which can be string-like or // `WordAndIndex`. @@ -212,23 +185,13 @@ class VocabularyOnDisk { if (it == end()) { return {std::nullopt, getHighestId() + 1}; } else { - return *it; + return {*it, static_cast(it - begin())}; } } // Get the `OffsetAndSize` for the element with the `idx`. Return // `std::nullopt` if `idx` is not contained in the vocabulary. - std::optional getOffsetAndSize(uint64_t idx) const; - - // Return the `OffsetSizeId` for the element with the i-th smallest ID. - // Requires that i < size(). - OffsetSizeId getOffsetSizeIdForIthElement(uint64_t i) const; - - // Return the `OffsetAndSize` for the element with the i-th smallest ID. - // Requires that i < size(). - OffsetAndSize getOffsetAndSizeForIthElement(uint64_t i) const { - return getOffsetSizeIdForIthElement(i); - } + OffsetAndSize getOffsetAndSize(uint64_t idx) const; // Build a vocabulary from any type that is forward-iterable and yields // pairs of (string-like, ID). Used as the common implementation for diff --git a/test/VocabularyOnDiskTest.cpp b/test/VocabularyOnDiskTest.cpp index 130ff64577..22774821a1 100644 --- a/test/VocabularyOnDiskTest.cpp +++ b/test/VocabularyOnDiskTest.cpp @@ -110,18 +110,6 @@ TEST(VocabularyOnDisk, LowerUpperBoundStdLess) { createVocabularyFromDisk("lowerUpperBoundStdLess2")); } -TEST(VocabularyOnDisk, LowerUpperBoundStdLessNonContiguousIds) { - std::vector words{"alpha", "betta", "chimes", "someVery123Word"}; - std::vector ids{2, 4, 8, 42}; - VocabularyCreator creator1{"lowerUppperBoundStdLessNonContiguousIds1"}; - testUpperAndLowerBoundWithStdLessFromWordsAndIds( - creator1.createVocabularyImpl(words, ids), words, ids); - - VocabularyCreator creator2{"lowerUppperBoundStdLessNonContiguousIds2"}; - testUpperAndLowerBoundWithStdLessFromWordsAndIds( - creator2.createVocabularyFromDiskImpl(words, ids), words, ids); -} - TEST(VocabularyOnDisk, LowerUpperBoundNumeric) { testUpperAndLowerBoundWithNumericComparator( createVocabulary("lowerUpperBoundNumeric1")); @@ -129,18 +117,6 @@ TEST(VocabularyOnDisk, LowerUpperBoundNumeric) { createVocabularyFromDisk("lowerUpperBoundNumeric2")); } -TEST(VocabularyOnDisk, LowerUpperBoundNumericNonContiguousIds) { - std::vector words{"4", "33", "222", "1111"}; - std::vector ids{2, 4, 8, 42}; - - VocabularyCreator creator1{"lowerUpperBoundNumericNonContiguousIds1"}; - testUpperAndLowerBoundWithNumericComparatorFromWordsAndIds( - creator1.createVocabularyImpl(words, ids), words, ids); - VocabularyCreator creator2{"lowerUpperBoundNumericNonContiguousIds2"}; - testUpperAndLowerBoundWithNumericComparatorFromWordsAndIds( - creator2.createVocabularyFromDiskImpl(words, ids), words, ids); -} - TEST(VocabularyOnDisk, AccessOperator) { testAccessOperatorForUnorderedVocabulary(createVocabulary("AccessOperator1")); testAccessOperatorForUnorderedVocabulary( diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 30e7b49062..83d216a6bc 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -46,7 +46,7 @@ std::vector getAllIndexFilenames( indexBasename + ".prefixes", indexBasename + ".vocabulary.internal", indexBasename + ".vocabulary.external", - indexBasename + ".vocabulary.external.idsAndOffsets"}; + indexBasename + ".vocabulary.external.offsets"}; } namespace { From b3aadb592231fe27c93e00a00b96b58cff291126 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 18 Apr 2024 00:50:09 +0200 Subject: [PATCH 3/5] Disable timing tests for both Docker and macOS builds (#1325) The Docker build that performs ARM-64 cross compilation has unreliable timing when run as part of a GitHub workflow. We therefore now disable timing tests not only for the macOS build (like before), but also for the Docker builds. That way the publication of Docker images is not prevented by spuriously failing timing tests anymore. --- .github/workflows/macos.yml | 2 +- CMakeLists.txt | 139 ++++++++++++++++---------------- Dockerfile | 2 +- test/CancellationHandleTest.cpp | 8 +- test/ProgressBarTest.cpp | 4 +- test/TimerTest.cpp | 12 +-- test/TurtleParserTest.cpp | 4 +- 7 files changed, 87 insertions(+), 84 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index d0efe2c515..9d03073613 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -66,7 +66,7 @@ jobs: conan install .. -pr:b=../conanprofiles/clang-16-macos -pr:h=../conanprofiles/clang-16-macos -of=. --build=missing; - name: Configure CMake # For std::ranges::join_view we need the -fexperimental-library flag on libc++16, which on Mac requires to manually tinker with the linking flags. - run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++" + run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -D_NO_TIMING_TESTS=ON -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++" - name: Build # Build your program with the given configuration diff --git a/CMakeLists.txt b/CMakeLists.txt index 379816c78b..45f338ade8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,10 +7,10 @@ set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # will not take effect without FORCE set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR} CACHE PATH "Install top-level directory" FORCE) -endif() +endif () # Boost::ASIO currently seems to have a bug when multiple coroutine streams are # concurrently in flight because there were multiple calls to `co_spawn`. @@ -24,20 +24,20 @@ add_definitions("-DBOOST_ASIO_DISABLE_AWAITABLE_FRAME_RECYCLING") # Coroutines require an additional compiler flag that is called differently # on clang and g++ -if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.0.0") - MESSAGE(FATAL_ERROR "G++ versions older than 11.0 are not supported by QLever") - else() - add_compile_options(-fcoroutines) - endif() - -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "16.0.0") +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.0.0") + MESSAGE(FATAL_ERROR "G++ versions older than 11.0 are not supported by QLever") + else () + add_compile_options(-fcoroutines) + endif () + +elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "16.0.0") MESSAGE(FATAL_ERROR "Clang++ versions older than 16.0 are not supported by QLever") - endif() -else() + endif () +else () MESSAGE(FATAL_ERROR "QLever currently only supports the G++ or LLVM-Clang++ compilers. Found ${CMAKE_CXX_COMPILER_ID}") -endif() +endif () ## Build targets for address sanitizer # AddressSanitize @@ -70,7 +70,7 @@ include(FetchContent) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571 # release-1.14.0 + GIT_TAG f8d7d77c06936315286eb55f8de22cd23c188571 # release-1.14.0 ) ################################ @@ -87,7 +87,7 @@ FetchContent_GetProperties(nlohmann-json) if (NOT nlohmann-json_POPULATED) FetchContent_Populate(nlohmann-json) include_directories(SYSTEM ${nlohmann-json_SOURCE_DIR}/single_include) -endif() +endif () ############################### @@ -96,7 +96,7 @@ endif() FetchContent_Declare( antlr GIT_REPOSITORY https://github.com/antlr/antlr4.git - GIT_TAG 9239e6ff444420516b44b7621e8dc7691fcf0e16 + GIT_TAG 9239e6ff444420516b44b7621e8dc7691fcf0e16 ) # From ANTLR we actually don't want the toplevel directory (which doesn't @@ -108,10 +108,10 @@ FetchContent_GetProperties(antlr) if (NOT antlr_POPULATED) FetchContent_Populate(antlr) set(ANTLR_BUILD_CPP_TESTS OFF CACHE BOOL "don't try to build googletest twice") - add_subdirectory(${antlr_SOURCE_DIR}/runtime/Cpp EXCLUDE_FROM_ALL ) + add_subdirectory(${antlr_SOURCE_DIR}/runtime/Cpp EXCLUDE_FROM_ALL) target_compile_options(antlr4_static PRIVATE -Wno-all -Wno-extra -Wno-unqualified-std-cast-call -Wno-error -Wno-deprecated-declarations) - include_directories(SYSTEM ${antlr_SOURCE_DIR}/runtime/Cpp/runtime/src ) -endif() + include_directories(SYSTEM ${antlr_SOURCE_DIR}/runtime/Cpp/runtime/src) +endif () ################################ # Threading @@ -129,38 +129,37 @@ find_package(ICU 60 REQUIRED COMPONENTS uc i18n) find_package(jemalloc QUIET) if (TARGET jemalloc::jemalloc) - MESSAGE(STATUS "Use jemalloc that was installed via conan") - link_libraries(jemalloc::jemalloc) + MESSAGE(STATUS "Use jemalloc that was installed via conan") + link_libraries(jemalloc::jemalloc) elseif (${JEMALLOC_MANUALLY_INSTALLED}) - link_libraries(jemalloc) -else() - find_package(PkgConfig) - pkg_check_modules (JEMALLOC jemalloc) - - pkg_search_module(JEMALLOC jemalloc) - if (${JEMALLOC_FOUND}) - include_directories(${JEMALLOC_INCLUDE_DIRS}) - link_libraries(${JEMALLOC_LIBRARIES}) - else () - message(WARNING "Jemalloc could not be found via + link_libraries(jemalloc) +else () + find_package(PkgConfig) + pkg_check_modules(JEMALLOC jemalloc) + + pkg_search_module(JEMALLOC jemalloc) + if (${JEMALLOC_FOUND}) + include_directories(${JEMALLOC_INCLUDE_DIRS}) + link_libraries(${JEMALLOC_LIBRARIES}) + else () + message(WARNING "Jemalloc could not be found via pkg-config. If you are sure that you have installed jemalloc on your system (e.g. via `apt install libjemalloc-dev` on Ubuntu), you might try rerunning cmake with `-DJEMALLOC_MANUALLY_INSTALLED=True`. This is currently necessary on Ubuntu 18.04, where pkg-config does not find jemalloc. Continuing without jemalloc, this will impact the performance, most notably of the IndexBuilder") - endif() -endif() + endif () +endif () ### ZSTD find_package(ZSTD QUIET) if (TARGET zstd::libzstd_static) MESSAGE(STATUS "Use zstd that was installed via conan") link_libraries(zstd::libzstd_static) -else() -link_libraries(zstd) -endif() - +else () + link_libraries(zstd) +endif () ###################################### @@ -182,15 +181,15 @@ find_package(OpenSSL REQUIRED) # function `qlever_target_link_libraries` for all libraries and executables. It # is a drop-in replacement for `target_link_libraries` that additionally links # against the common libraries. -function (qlever_target_link_libraries target) +function(qlever_target_link_libraries target) target_link_libraries(${target} ${ARGN} absl::flat_hash_map - absl::flat_hash_set absl::strings absl::str_format ICU::uc - ICU::i18n OpenSSL::SSL OpenSSL::Crypto GTest::gtest GTest::gmock stxxl fsst) + absl::flat_hash_set absl::strings absl::str_format ICU::uc + ICU::i18n OpenSSL::SSL OpenSSL::Crypto GTest::gtest GTest::gmock stxxl fsst) # memorySize is a utility library for defining memory sizes. if (NOT ${target} STREQUAL "memorySize") target_link_libraries(${target} memorySize) - endif() + endif () endfunction() @@ -207,7 +206,6 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS} set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") - if (${PERFTOOLS_PROFILER}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lprofiler") message(STATUS "Adding -lprofiler (make sure your have google-perftools installed.)") @@ -229,9 +227,9 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") # CTRE, Compile-Time-Regular-Expressions ################################ FetchContent_Declare( - ctre - GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git - GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1 + ctre + GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git + GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1 ) ################################ @@ -254,29 +252,34 @@ if (USE_PARALLEL) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") add_definitions("-D_PARALLEL_SORT") endif () -endif() +endif () + +OPTION(_NO_TIMING_TESTS "Disable timing tests on platforms where `sleep` is unreliable" OFF) +if (_NO_TIMING_TESTS) + add_definitions("-D_QLEVER_NO_TIMING_TESTS") +endif () if (USE_TREE_BASED_CACHE) add_definitions("-D_QLEVER_USE_TREE_BASED_CACHE") -endif() +endif () if (RUN_EXPENSIVE_TESTS) message(STATUS "Running expensive unit tests. This is only recommended in release builds") add_definitions("-DQLEVER_RUN_EXPENSIVE_TESTS") -endif() +endif () if (ENABLE_EXPENSIVE_CHECKS) message(STATUS "Enabling checks that potentially have a significant runtime overhead") add_definitions("-DAD_ENABLE_EXPENSIVE_CHECKS") -endif() +endif () set(QUERY_CANCELLATION_MODE "ENABLED" CACHE STRING "Option to allow disabling cancellation checks partially or completely to reduce the overhead of this mechanism during query computation.") # Hint for cmake gui, but not actually enforced set_property(CACHE QUERY_CANCELLATION_MODE PROPERTY STRINGS "ENABLED" "NO_WATCH_DOG" "DISABLED") # So enforce this ourselves -if(QUERY_CANCELLATION_MODE AND NOT QUERY_CANCELLATION_MODE MATCHES "ENABLED|NO_WATCH_DOG|DISABLED") +if (QUERY_CANCELLATION_MODE AND NOT QUERY_CANCELLATION_MODE MATCHES "ENABLED|NO_WATCH_DOG|DISABLED") message(FATAL_ERROR "Invalid value for QUERY_CANCELLATION_MODE '${QUERY_CANCELLATION_MODE}'. Please remove the option entirely or change it to ENABLED, NO_WATCH_DOG or DISABLED.") -endif() +endif () add_definitions("-DQUERY_CANCELLATION_MODE=${QUERY_CANCELLATION_MODE}") ################################ @@ -322,9 +325,9 @@ FetchContent_MakeAvailable(googletest ctre abseil re2 stxxl fsst) # Disable some warnings in RE2, STXXL, and GTEST target_compile_options(re2 PRIVATE -Wno-unused-parameter) target_compile_options(stxxl PRIVATE -Wno-deprecated-declarations) -if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - target_compile_options(gtest PRIVATE -Wno-maybe-uninitialized) -endif() +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + target_compile_options(gtest PRIVATE -Wno-maybe-uninitialized) +endif () include_directories(${ctre_SOURCE_DIR}/single-header) target_compile_options(fsst PRIVATE -Wno-extra -Wno-all -Wno-error) target_compile_options(fsst12 PRIVATE -Wno-extra -Wno-all -Wno-error) @@ -353,13 +356,13 @@ if (NOT DONT_UPDATE_COMPILATION_INFO) # The first output which is never created is necessary s.t. the command is never cached and # always rerun. add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/FileThatNeverExists.cpp" - "${CMAKE_CURRENT_BINARY_DIR}/CompilationInfo.cpp" - COMMAND cmake -P ${CMAKE_CURRENT_SOURCE_DIR}/CompilationInfo.cmake) -else() + "${CMAKE_CURRENT_BINARY_DIR}/CompilationInfo.cpp" + COMMAND cmake -P ${CMAKE_CURRENT_SOURCE_DIR}/CompilationInfo.cmake) +else () add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/CompilationInfo.cpp" COMMAND cmake -P ${CMAKE_CURRENT_SOURCE_DIR}/CompilationInfo.cmake) -endif() +endif () set(LOG_LEVEL_FATAL FATAL) set(LOG_LEVEL_ERROR ERROR) @@ -370,11 +373,11 @@ set(LOG_LEVEL_TIMING TIMING) set(LOG_LEVEL_TRACE TRACE) -if(CMAKE_BUILD_TYPE MATCHES DEBUG) - set(LOGLEVEL DEBUG CACHE STRING "The loglevel") -else() - set(LOGLEVEL INFO CACHE STRING "The loglevel") -endif() +if (CMAKE_BUILD_TYPE MATCHES DEBUG) + set(LOGLEVEL DEBUG CACHE STRING "The loglevel") +else () + set(LOGLEVEL INFO CACHE STRING "The loglevel") +endif () set_property(CACHE LOGLEVEL PROPERTY STRINGS FATAL ERROR WARN INFO DEBUG TIMING TRACE) add_definitions(-DLOGLEVEL=${LOG_LEVEL_${LOGLEVEL}}) @@ -382,9 +385,9 @@ add_definitions(-DLOGLEVEL=${LOG_LEVEL_${LOGLEVEL}}) ################################################## # Warnings about incorrect combination of CMake variables -if(LOGLEVEL MATCHES "FATAL|ERROR" AND QUERY_CANCELLATION_MODE EQUAL "ENABLED") +if (LOGLEVEL MATCHES "FATAL|ERROR" AND QUERY_CANCELLATION_MODE EQUAL "ENABLED") message(WARNING "Log level is not printing logs with level WARN, which is necessary when QUERY_CANCELLATION_MODE=ENABLED for it to work properly") -endif() +endif () ################################################## # Precompiled headers @@ -411,7 +414,7 @@ add_executable(IndexBuilderMain src/index/IndexBuilderMain.cpp) qlever_target_link_libraries(IndexBuilderMain index ${CMAKE_THREAD_LIBS_INIT} Boost::program_options) add_executable(ServerMain src/ServerMain.cpp) -qlever_target_link_libraries (ServerMain engine ${CMAKE_THREAD_LIBS_INIT} Boost::program_options) +qlever_target_link_libraries(ServerMain engine ${CMAKE_THREAD_LIBS_INIT} Boost::program_options) target_precompile_headers(ServerMain REUSE_FROM engine) add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp) diff --git a/Dockerfile b/Dockerfile index 611c014507..b2d2945de8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ WORKDIR /app/ ENV DEBIAN_FRONTEND=noninteractive WORKDIR /app/build/ -RUN cmake -DCMAKE_BUILD_TYPE=Release -DLOGLEVEL=INFO -DUSE_PARALLEL=true -GNinja .. && ninja +RUN cmake -DCMAKE_BUILD_TYPE=Release -DLOGLEVEL=INFO -DUSE_PARALLEL=true -D_NO_TIMING_TESTS=ON -GNinja .. && ninja RUN ctest --rerun-failed --output-on-failure FROM base as runtime diff --git a/test/CancellationHandleTest.cpp b/test/CancellationHandleTest.cpp index 3515165e3f..241e2c9005 100644 --- a/test/CancellationHandleTest.cpp +++ b/test/CancellationHandleTest.cpp @@ -153,8 +153,8 @@ TEST(CancellationHandle, ensureObjectLifetimeIsValidWithoutWatchDogStarted) { namespace ad_utility { TEST(CancellationHandle, verifyWatchDogDoesChangeState) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif CancellationHandle handle; @@ -172,8 +172,8 @@ TEST(CancellationHandle, verifyWatchDogDoesChangeState) { // _____________________________________________________________________________ TEST(CancellationHandle, verifyWatchDogDoesNotChangeStateAfterCancel) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif CancellationHandle handle; handle.startWatchDog(); diff --git a/test/ProgressBarTest.cpp b/test/ProgressBarTest.cpp index 53e2dca917..2df1601c1c 100644 --- a/test/ProgressBarTest.cpp +++ b/test/ProgressBarTest.cpp @@ -29,7 +29,7 @@ TEST(ProgressBar, typicalUsage) { // NOTE: For macOS, `std::this_thread::sleep_for` can take much longer // than indicated, resulting in a much lower speed than expected. std::string expectedSpeedRegex = -#ifndef __APPLE__ +#ifndef _QLEVER_NO_TIMING_TESTS "\\[average speed [234]\\.[0-9] M/s, last batch [234]\\.[0-9] M/s" ", fastest [234]\\.[0-9] M/s, slowest [234]\\.[0-9] M/s\\] "; #else @@ -67,7 +67,7 @@ TEST(ProgressBar, numberOfStepsLessThanBatchSize) { ProgressBar progressBar(numSteps, "Steps: ", 5'000); std::this_thread::sleep_for(std::chrono::milliseconds(1)); std::string expectedUpdateRegex = -#ifndef __APPLE__ +#ifndef _QLEVER_NO_TIMING_TESTS "Steps: 3,000 \\[average speed [234]\\.[0-9] M/s\\] \n"; #else "Steps: 3,000 \\[average speed [0-9]\\.[0-9] M/s\\] \n"; diff --git a/test/TimerTest.cpp b/test/TimerTest.cpp index 4a0f462ffd..f14e808333 100644 --- a/test/TimerTest.cpp +++ b/test/TimerTest.cpp @@ -32,8 +32,8 @@ void testTime(const ad_utility::Timer& timer, } TEST(Timer, BasicWorkflow) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif Timer t{Timer::Started}; ASSERT_TRUE(t.isRunning()); @@ -85,8 +85,8 @@ TEST(Timer, BasicWorkflow) { } TEST(Timer, InitiallyStopped) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif Timer t{Timer::Stopped}; ASSERT_FALSE(t.isRunning()); @@ -102,8 +102,8 @@ TEST(Timer, InitiallyStopped) { } TEST(TimeBlockAndLog, TimeBlockAndLog) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif std::string s; { diff --git a/test/TurtleParserTest.cpp b/test/TurtleParserTest.cpp index 85c02977fd..215fd065bd 100644 --- a/test/TurtleParserTest.cpp +++ b/test/TurtleParserTest.cpp @@ -808,8 +808,8 @@ TEST(TurtleParserTest, exceptionPropagationFileBufferReading) { // blocking, even when there are still lots of blocks in the pipeline that are // currently being parsed. TEST(TurtleParserTest, stopParsingOnOutsideFailure) { -#ifdef __APPLE__ - GTEST_SKIP_("sleep_for is unreliable for macos builds"); +#ifdef _QLEVER_NO_TIMING_TESTS + GTEST_SKIP_("because _QLEVER_NO_TIMING_TESTS defined"); #endif std::string filename{"turtleParserStopParsingOnOutsideFailure.dat"}; auto testWithParser = [&]( From c562e91f18c449f0e918286b22b2fddad876c1ae Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 18 Apr 2024 00:53:06 +0200 Subject: [PATCH 4/5] Fix a subtle bug in `CompressedExternalIdTableSorter` (#1328) Under certain (rare) conditions, `CompressedExternalIdTableSorter::sortedBlocks` produced empty blocks, which led to an exception. This is now fixed. Also make sure that an exception in the building of a permutation pair does not result in a deadlock anymore (as it did for the bug just mentioned), but in a proper error message. --- src/engine/idTable/CompressedExternalIdTable.h | 7 ++++++- src/index/IndexImpl.cpp | 5 +++++ test/engine/idTable/CompressedExternalIdTableTest.cpp | 7 ++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/engine/idTable/CompressedExternalIdTable.h b/src/engine/idTable/CompressedExternalIdTable.h index aacb154d4f..d444db8e69 100644 --- a/src/engine/idTable/CompressedExternalIdTable.h +++ b/src/engine/idTable/CompressedExternalIdTable.h @@ -667,6 +667,9 @@ class CompressedExternalIdTableSorter // requested for the output, and the single block is larger than this // blocksize, we manually have to split it into chunks. auto& block = this->currentBlock_; + if (block.empty()) { + co_return; + } const auto blocksizeOutput = blocksize.value_or(block.numRows()); if (block.numRows() <= blocksizeOutput) { if (this->moveResultOnMerge_) { @@ -735,7 +738,9 @@ class CompressedExternalIdTableSorter } } numPopped += result.numRows(); - co_yield std::move(result).template toStatic(); + if (!result.empty()) { + co_yield std::move(result).template toStatic(); + } AD_CORRECTNESS_CHECK(numPopped == this->numElementsPushed_); } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 6a6ab048c1..705dc182dc 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -208,6 +208,11 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( // S P O PatternOfS PatternOfO, sorted by OPS. auto blockGenerator = [](auto& queue) -> cppcoro::generator> { + // If an exception occurs in the block that is consuming the blocks yielded + // from this generator, we have to explicitly finish the `queue`, otherwise + // there will be a deadlock because the threads involved in the queue can + // never join. + absl::Cleanup cl{[&queue]() { queue.finish(); }}; while (auto block = queue.pop()) { co_yield fixBlockAfterPatternJoin(std::move(block)); } diff --git a/test/engine/idTable/CompressedExternalIdTableTest.cpp b/test/engine/idTable/CompressedExternalIdTableTest.cpp index 5d679ef814..9bc253f29b 100644 --- a/test/engine/idTable/CompressedExternalIdTableTest.cpp +++ b/test/engine/idTable/CompressedExternalIdTableTest.cpp @@ -107,8 +107,13 @@ void testExternalSorterImpl(size_t numDynamicColumns, size_t numRows, } for (size_t k = 0; k < 5; ++k) { - auto generator = writer.sortedView(); + // Also test the case that the blocksize does not exactly divides the + // number of inputs. + auto blocksize = k == 1 ? 1 : 17; using namespace ::testing; + auto generator = k == 0 ? std::views::join(ad_utility::OwningView{ + writer.getSortedBlocks(blocksize)}) + : writer.sortedView(); if (mergeMultipleTimes || k == 0) { auto result = idTableFromRowGenerator( generator, numDynamicColumns); From 853d877272bd5c2375d02826d147d86edfc3db59 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:43:13 +0200 Subject: [PATCH 5/5] Simplify the `QueryExecutionTree` class (#1327) Removes `Operation::getType` and replaces it with `dynamic_cast` checks which in my microbenchmarks had almost identical performance (10-20ns difference). This makes the adding of additional operations much simpler. Also perform some minor cleanups on the codebase (like consistent formatting of include paths). --- benchmark/GroupByHashMapBenchmark.cpp | 2 +- src/engine/IndexScan.cpp | 3 +- src/engine/IndexScan.h | 10 +- src/engine/Join.cpp | 38 +++---- src/engine/Operation.cpp | 3 +- src/engine/Operation.h | 13 ++- src/engine/QueryExecutionTree.cpp | 125 ++--------------------- src/engine/QueryExecutionTree.h | 71 +++---------- src/engine/QueryPlanner.cpp | 66 ++++++------ src/engine/QueryPlanner.h | 6 +- src/parser/Literal.h | 3 + src/util/Conversions.h | 4 +- test/SparqlParserTest.cpp | 3 +- test/engine/CartesianProductJoinTest.cpp | 2 +- {src => test}/engine/ValuesForTesting.h | 0 test/util/IdTableHelpers.cpp | 4 +- test/util/IdTableHelpers.h | 2 +- test/util/OperationTestHelpers.h | 6 -- 18 files changed, 104 insertions(+), 257 deletions(-) rename {src => test}/engine/ValuesForTesting.h (100%) diff --git a/benchmark/GroupByHashMapBenchmark.cpp b/benchmark/GroupByHashMapBenchmark.cpp index 6c05da3b7a..8b32efa0fd 100644 --- a/benchmark/GroupByHashMapBenchmark.cpp +++ b/benchmark/GroupByHashMapBenchmark.cpp @@ -6,12 +6,12 @@ #include #include "../benchmark/infrastructure/Benchmark.h" +#include "../test/engine/ValuesForTesting.h" #include "../test/util/IdTableHelpers.h" #include "../test/util/IndexTestHelpers.h" #include "engine/GroupBy.h" #include "engine/Sort.h" #include "engine/Values.h" -#include "engine/ValuesForTesting.h" #include "engine/sparqlExpressions/AggregateExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/LiteralExpression.h" diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index c04774e74b..15ecb9fa83 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -4,10 +4,11 @@ #include "engine/IndexScan.h" +#include + #include #include -#include "absl/strings/str_join.h" #include "index/IndexImpl.h" #include "index/TriplesView.h" #include "parser/ParsedQuery.h" diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index 26f8c3f694..900917f407 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -7,8 +7,6 @@ #include "./Operation.h" -using std::string; - class SparqlTriple; class SparqlTripleSimple; @@ -43,7 +41,7 @@ class IndexScan : public Operation { const std::vector& additionalColumns() const { return additionalColumns_; } - string getDescriptor() const override; + std::string getDescriptor() const override; size_t getResultWidth() const override; @@ -93,6 +91,10 @@ class IndexScan : public Operation { bool knownEmptyResult() override { return getExactSize() == 0; } + bool isIndexScanWithNumVariables(size_t target) const override { + return numVariables() == target; + } + // Currently only the full scans support a limit clause. [[nodiscard]] bool supportsLimit() const override { return getResultWidth() == 3; @@ -114,7 +116,7 @@ class IndexScan : public Operation { size_t computeSizeEstimate() const; - string getCacheKeyImpl() const override; + std::string getCacheKeyImpl() const override; VariableToColumnMap computeVariableToColumnMap() const override; diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp index aac3cbad80..cba6ab70fb 100644 --- a/src/engine/Join.cpp +++ b/src/engine/Join.cpp @@ -47,8 +47,8 @@ Join::Join(QueryExecutionContext* qec, std::shared_ptr t1, // If one of the inputs is a SCAN and the other one is not, always make the // SCAN the right child (which also gives a deterministic order of the // subtrees). This simplifies several branches in the `computeResult` method. - if (t1->getType() == QueryExecutionTree::SCAN && - t2->getType() != QueryExecutionTree::SCAN) { + if (std::dynamic_pointer_cast(t1->getRootOperation()) && + !std::dynamic_pointer_cast(t2->getRootOperation())) { swapChildren(); } _left = std::move(t1); @@ -131,12 +131,14 @@ ResultTable Join::computeResult() { auto rightResIfCached = getCachedOrSmallResult(*_right, _rightJoinCol); checkCancellation(); - if (_left->getType() == QueryExecutionTree::SCAN && - _right->getType() == QueryExecutionTree::SCAN) { + auto leftIndexScan = + std::dynamic_pointer_cast(_left->getRootOperation()); + if (leftIndexScan && + std::dynamic_pointer_cast(_right->getRootOperation())) { if (rightResIfCached && !leftResIfCached) { idTable = computeResultForIndexScanAndIdTable( - rightResIfCached->idTable(), _rightJoinCol, - dynamic_cast(*_left->getRootOperation()), _leftJoinCol); + rightResIfCached->idTable(), _rightJoinCol, *leftIndexScan, + _leftJoinCol); checkCancellation(); return {std::move(idTable), resultSortedOn(), LocalVocab{}}; @@ -169,11 +171,11 @@ ResultTable Join::computeResult() { const auto& leftIdTable = leftRes->idTable(); auto leftHasUndef = !leftIdTable.empty() && leftIdTable.at(0, _leftJoinCol).isUndefined(); - if (_right->getType() == QueryExecutionTree::SCAN && !rightResIfCached && - !leftHasUndef) { + auto rightIndexScan = + std::dynamic_pointer_cast(_right->getRootOperation()); + if (rightIndexScan && !rightResIfCached && !leftHasUndef) { idTable = computeResultForIndexScanAndIdTable( - leftRes->idTable(), _leftJoinCol, - dynamic_cast(*_right->getRootOperation()), _rightJoinCol); + leftRes->idTable(), _leftJoinCol, *rightIndexScan, _rightJoinCol); checkCancellation(); return {std::move(idTable), resultSortedOn(), leftRes->getSharedLocalVocab()}; @@ -565,8 +567,11 @@ void updateRuntimeInfoForLazyScan( // ______________________________________________________________________________________________________ IdTable Join::computeResultForTwoIndexScans() { - AD_CORRECTNESS_CHECK(_left->getType() == QueryExecutionTree::SCAN && - _right->getType() == QueryExecutionTree::SCAN); + auto leftScan = + std::dynamic_pointer_cast(_left->getRootOperation()); + auto rightScan = + std::dynamic_pointer_cast(_right->getRootOperation()); + AD_CORRECTNESS_CHECK(leftScan && rightScan); // The join column already is the first column in both inputs, so we don't // have to permute the inputs and results for the `AddCombinedRowToIdTable` // class to work correctly. @@ -575,12 +580,9 @@ IdTable Join::computeResultForTwoIndexScans() { 1, IdTable{getResultWidth(), getExecutionContext()->getAllocator()}, cancellationHandle_}; - auto& leftScan = dynamic_cast(*_left->getRootOperation()); - auto& rightScan = dynamic_cast(*_right->getRootOperation()); - ad_utility::Timer timer{ad_utility::timer::Timer::InitialStatus::Started}; auto [leftBlocksInternal, rightBlocksInternal] = - IndexScan::lazyScanForJoinOfTwoScans(leftScan, rightScan); + IndexScan::lazyScanForJoinOfTwoScans(*leftScan, *rightScan); runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); auto leftBlocks = convertGenerator(std::move(leftBlocksInternal)); @@ -589,8 +591,8 @@ IdTable Join::computeResultForTwoIndexScans() { ad_utility::zipperJoinForBlocksWithoutUndef(leftBlocks, rightBlocks, std::less{}, rowAdder); - updateRuntimeInfoForLazyScan(leftScan, leftBlocks.details()); - updateRuntimeInfoForLazyScan(rightScan, rightBlocks.details()); + updateRuntimeInfoForLazyScan(*leftScan, leftBlocks.details()); + updateRuntimeInfoForLazyScan(*rightScan, rightBlocks.details()); AD_CORRECTNESS_CHECK(leftBlocks.details().numBlocksRead_ <= rightBlocks.details().numElementsRead_); diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index ee888276ed..46649e7e96 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -100,8 +100,7 @@ shared_ptr Operation::getResult(bool isRoot, auto lock = getExecutionContext()->getQueryTreeCache().pinnedSizes().wlock(); forAllDescendants([&lock](QueryExecutionTree* child) { - if (child->getType() == QueryExecutionTree::OperationType::SCAN && - child->getResultWidth() == 1) { + if (child->getRootOperation()->isIndexScanWithNumVariables(1)) { (*lock)[child->getRootOperation()->getCacheKey()] = child->getSizeEstimate(); } diff --git a/src/engine/Operation.h b/src/engine/Operation.h index af44ed2a53..07ea31dce7 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -7,9 +7,7 @@ #pragma once #include -#include #include -#include #include "engine/QueryExecutionContext.h" #include "engine/ResultTable.h" @@ -19,9 +17,6 @@ #include "parser/data/Variable.h" #include "util/CancellationHandle.h" #include "util/CompilerExtensions.h" -#include "util/Exception.h" -#include "util/Log.h" -#include "util/TypeTraits.h" // forward declaration needed to break dependencies class QueryExecutionTree; @@ -117,6 +112,14 @@ class Operation { virtual void setSelectedVariablesForSubquery( const std::vector& selectedVariables) final; + /// Return true if this object is an instance of `IndexScan` and has the + /// specified number of variables. For this to work this function needs to + /// be overridden by `IndexScan` to do the right thing. + virtual bool isIndexScanWithNumVariables( + [[maybe_unused]] size_t target) const { + return false; + } + RuntimeInformation& runtimeInfo() const { return *_runtimeInfo; } std::shared_ptr getRuntimeInfoPointer() { diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index 831193d386..0b195338bc 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -6,37 +6,13 @@ #include "./QueryExecutionTree.h" -#include -#include +#include +#include +#include #include -#include +#include -#include "engine/Bind.h" -#include "engine/CartesianProductJoin.h" -#include "engine/CountAvailablePredicates.h" -#include "engine/Distinct.h" -#include "engine/ExportQueryExecutionTrees.h" -#include "engine/Filter.h" -#include "engine/GroupBy.h" -#include "engine/HasPredicateScan.h" -#include "engine/IndexScan.h" -#include "engine/Join.h" -#include "engine/Minus.h" -#include "engine/MultiColumnJoin.h" -#include "engine/NeutralElementOperation.h" -#include "engine/OptionalJoin.h" -#include "engine/OrderBy.h" -#include "engine/Service.h" #include "engine/Sort.h" -#include "engine/TextIndexScanForEntity.h" -#include "engine/TextIndexScanForWord.h" -#include "engine/TransitivePathBase.h" -#include "engine/Union.h" -#include "engine/Values.h" -#include "engine/ValuesForTesting.h" -#include "parser/RdfEscaping.h" - -using std::string; using parsedQuery::SelectClause; @@ -45,7 +21,7 @@ QueryExecutionTree::QueryExecutionTree(QueryExecutionContext* const qec) : qec_(qec) {} // _____________________________________________________________________________ -string QueryExecutionTree::getCacheKey() const { +std::string QueryExecutionTree::getCacheKey() const { return rootOperation_->getCacheKey(); } @@ -94,7 +70,7 @@ size_t QueryExecutionTree::getCostEstimate() { // result is pinned in cache. Nothing to compute return 0; } - if (type_ == QueryExecutionTree::SCAN && getResultWidth() == 1) { + if (getRootOperation()->isIndexScanWithNumVariables(1)) { return getSizeEstimate(); } else { return rootOperation_->getCostEstimate(); @@ -142,95 +118,6 @@ void QueryExecutionTree::readFromCache() { } } -template -void QueryExecutionTree::setOperation(std::shared_ptr operation) { - if constexpr (std::is_same_v) { - type_ = SCAN; - } else if constexpr (std::is_same_v) { - type_ = UNION; - } else if constexpr (std::is_same_v) { - type_ = BIND; - } else if constexpr (std::is_same_v) { - type_ = SORT; - } else if constexpr (std::is_same_v) { - type_ = DISTINCT; - } else if constexpr (std::is_same_v) { - type_ = VALUES; - } else if constexpr (std::is_same_v) { - type_ = SERVICE; - } else if constexpr (std::is_same_v) { - type_ = TRANSITIVE_PATH; - } else if constexpr (std::is_same_v) { - type_ = ORDER_BY; - } else if constexpr (std::is_same_v) { - type_ = GROUP_BY; - } else if constexpr (std::is_same_v) { - type_ = HAS_PREDICATE_SCAN; - } else if constexpr (std::is_same_v) { - type_ = FILTER; - } else if constexpr (std::is_same_v) { - type_ = NEUTRAL_ELEMENT; - } else if constexpr (std::is_same_v) { - type_ = JOIN; - } else if constexpr (std::is_same_v) { - type_ = TEXT_INDEX_SCAN_FOR_WORD; - } else if constexpr (std::is_same_v) { - type_ = TEXT_INDEX_SCAN_FOR_ENTITY; - } else if constexpr (std::is_same_v) { - type_ = COUNT_AVAILABLE_PREDICATES; - } else if constexpr (std::is_same_v) { - type_ = MINUS; - } else if constexpr (std::is_same_v) { - type_ = OPTIONAL_JOIN; - } else if constexpr (std::is_same_v) { - type_ = MULTICOLUMN_JOIN; - } else if constexpr (std::is_same_v || - std::is_same_v) { - type_ = DUMMY; - } else if constexpr (std::is_same_v) { - type_ = CARTESIAN_PRODUCT_JOIN; - } else { - static_assert(ad_utility::alwaysFalse, - "New type of operation that was not yet registered"); - } - rootOperation_ = std::move(operation); - readFromCache(); -} - -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation(std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); -template void QueryExecutionTree::setOperation( - std::shared_ptr); - // ________________________________________________________________________________________________________________ std::shared_ptr QueryExecutionTree::createSortedTree( std::shared_ptr qet, diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 10c2d74457..8f8b28eff5 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -6,64 +6,27 @@ #include #include #include -#include -#include +#include #include "engine/Operation.h" #include "engine/QueryExecutionContext.h" #include "parser/ParsedQuery.h" -#include "parser/data/ConstructQueryExportContext.h" #include "parser/data/Types.h" -#include "util/Conversions.h" -#include "util/Generator.h" -#include "util/HashSet.h" #include "util/stream_generator.h" -using std::shared_ptr; -using std::string; - // A query execution tree. Processed bottom up, which gives an ordering to the // operations needed to solve a query. class QueryExecutionTree { public: - explicit QueryExecutionTree(QueryExecutionContext* const qec); - template - QueryExecutionTree(QueryExecutionContext* const qec, - std::shared_ptr operation) + explicit QueryExecutionTree(QueryExecutionContext* qec); + QueryExecutionTree(QueryExecutionContext* qec, + std::shared_ptr operation) : QueryExecutionTree(qec) { - setOperation(std::move(operation)); + rootOperation_ = std::move(operation); + readFromCache(); } - enum OperationType { - UNDEFINED, - SCAN, - JOIN, - SORT, - ORDER_BY, - FILTER, - DISTINCT, - TEXT_INDEX_SCAN_FOR_WORD, - TEXT_INDEX_SCAN_FOR_ENTITY, - OPTIONAL_JOIN, - COUNT_AVAILABLE_PREDICATES, - GROUP_BY, - HAS_PREDICATE_SCAN, - UNION, - MULTICOLUMN_JOIN, - TRANSITIVE_PATH, - VALUES, - SERVICE, - BIND, - MINUS, - NEUTRAL_ELEMENT, - DUMMY, - CARTESIAN_PRODUCT_JOIN - }; - - template - void setOperation(std::shared_ptr); - - string getCacheKey() const; + std::string getCacheKey() const; const QueryExecutionContext* getQec() const { return qec_; } @@ -82,17 +45,13 @@ class QueryExecutionTree { std::shared_ptr getRootOperation() const { return rootOperation_; } - const OperationType& getType() const { return type_; } - - bool isEmpty() const { - return type_ == OperationType::UNDEFINED || !rootOperation_; - } + bool isEmpty() const { return !rootOperation_; } size_t getVariableColumn(const Variable& variable) const; size_t getResultWidth() const { return rootOperation_->getResultWidth(); } - shared_ptr getResult() const { + std::shared_ptr getResult() const { return rootOperation_->getResult(isRoot()); } @@ -103,7 +62,8 @@ class QueryExecutionTree { size_t columnIndex_; }; - using ColumnIndicesAndTypes = vector>; + using ColumnIndicesAndTypes = + std::vector>; // Returns a vector where the i-th element contains the column index and // `ResultType` of the i-th `selectVariable` in the `resultTable` @@ -144,7 +104,7 @@ class QueryExecutionTree { void readFromCache(); // recursively get all warnings from descendant operations - vector collectWarnings() const { + std::vector collectWarnings() const { return rootOperation_->collectWarnings(); } @@ -181,17 +141,17 @@ class QueryExecutionTree { // sorted accordingly, it is simply returned. static std::shared_ptr createSortedTree( std::shared_ptr qet, - const vector& sortColumns); + const std::vector& sortColumns); // Similar to `createSortedTree` (see directly above), but create the sorted // trees for two different trees, the sort columns of which are specified as // a vector of two-dimensional arrays. This format often appears in // `QueryPlanner.cpp`. static std::pair, - shared_ptr> + std::shared_ptr> createSortedTrees(std::shared_ptr qetA, std::shared_ptr qetB, - const vector>& sortColumns); + const std::vector>& sortColumns); // The return type of the `getSortedTreesAndJoinColumns` function below. It is // deliberately stored as a tuple vs. a struct with named members, so that we @@ -233,7 +193,6 @@ class QueryExecutionTree { QueryExecutionContext* qec_; // No ownership std::shared_ptr rootOperation_ = nullptr; // Owned child. Will be deleted at deconstruction. - OperationType type_ = OperationType::UNDEFINED; std::optional sizeEstimate_ = std::nullopt; bool isRoot_ = false; // used to distinguish the root from child // operations/subtrees when pinning only the result. diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 3f56dcaeea..65d1b040ea 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -487,7 +487,7 @@ std::vector QueryPlanner::optimize( // it might be, that we have not yet applied all the filters // (it might be, that the last join was optional and introduced new variables) if (!candidatePlans.empty()) { - applyFiltersIfPossible(candidatePlans[0], rootPattern->_filters, true); + applyFiltersIfPossible(candidatePlans[0], rootPattern->_filters); checkCancellation(); } @@ -1344,9 +1344,10 @@ string QueryPlanner::getPruningKey( } // _____________________________________________________________________________ +template void QueryPlanner::applyFiltersIfPossible( - vector& row, const vector& filters, - bool replace) const { + vector& row, + const vector& filters) const { // Apply every filter possible. // It is possible when, // 1) the filter has not already been applied @@ -1371,11 +1372,12 @@ void QueryPlanner::applyFiltersIfPossible( // in one go. Changing `row` inside the loop would invalidate the iterators. std::vector addedPlans; for (auto& plan : row) { - if (plan._qet->getType() == QueryExecutionTree::SCAN && - plan._qet->getResultWidth() == 3 && !replace) { - // Do not apply filters to dummies, except at the very end of query - // planning. - continue; + if constexpr (!replace) { + if (plan._qet->getRootOperation()->isIndexScanWithNumVariables(3)) { + // Do not apply filters to dummies, except at the very end of query + // planning. + continue; + } } for (size_t i = 0; i < filters.size(); ++i) { if (((plan._idsOfIncludedFilters >> i) & 1) != 0) { @@ -1393,7 +1395,7 @@ void QueryPlanner::applyFiltersIfPossible( newPlan._idsOfIncludedFilters |= (size_t(1) << i); newPlan._idsOfIncludedNodes = plan._idsOfIncludedNodes; newPlan.type = plan.type; - if (replace) { + if constexpr (replace) { plan = std::move(newPlan); } else { addedPlans.push_back(std::move(newPlan)); @@ -1414,7 +1416,7 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent( // (there might be duplicates because we already have multiple candidates // for each index scan with different permutations. dpTab.push_back(std::move(connectedComponent)); - applyFiltersIfPossible(dpTab.back(), filters, false); + applyFiltersIfPossible(dpTab.back(), filters); ad_utility::HashSet uniqueNodeIds; std::ranges::copy( dpTab.back() | std::views::transform(&SubtreePlan::_idsOfIncludedNodes), @@ -1429,7 +1431,7 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent( checkCancellation(); auto newPlans = merge(dpTab[i - 1], dpTab[k - i - 1], tg); dpTab[k - 1].insert(dpTab[k - 1].end(), newPlans.begin(), newPlans.end()); - applyFiltersIfPossible(dpTab.back(), filters, false); + applyFiltersIfPossible(dpTab.back(), filters); } // As we only passed in connected components, we expect the result to always // be nonempty. @@ -1467,7 +1469,7 @@ vector> QueryPlanner::fillDpTab( } if (numConnectedComponents == 1) { // A Cartesian product is not needed if there is only one component. - applyFiltersIfPossible(lastDpRowFromComponents.back(), filters, true); + applyFiltersIfPossible(lastDpRowFromComponents.back(), filters); return lastDpRowFromComponents; } // More than one connected component, set up a Cartesian product. @@ -1483,7 +1485,7 @@ vector> QueryPlanner::fillDpTab( std::back_inserter(subtrees)); result.at(0).push_back( makeSubtreePlan(_qec, std::move(subtrees))); - applyFiltersIfPossible(result.at(0), filters, true); + applyFiltersIfPossible(result.at(0), filters); return result; } @@ -1793,9 +1795,6 @@ std::vector QueryPlanner::createJoinCandidates( const auto& b = !swapForTesting ? bin : ain; std::vector candidates; - // We often query for the type of an operation, so we shorten these checks. - using enum QueryExecutionTree::OperationType; - // TODO find out, what is ACTUALLY the use case for the triple // graph. Is it only meant for (questionable) performance reasons // or does it change the meaning. @@ -1849,8 +1848,8 @@ std::vector QueryPlanner::createJoinCandidates( // CASE: JOIN ON ONE COLUMN ONLY. // Skip if we have two operations, where all three positions are variables. - if (a._qet->getType() == SCAN && a._qet->getResultWidth() == 3 && - b._qet->getType() == SCAN && b._qet->getResultWidth() == 3) { + if (a._qet->getRootOperation()->isIndexScanWithNumVariables(3) && + b._qet->getRootOperation()->isIndexScanWithNumVariables(3)) { return candidates; } @@ -1883,18 +1882,16 @@ auto QueryPlanner::createJoinWithTransitivePath( SubtreePlan a, SubtreePlan b, const std::vector>& jcs) -> std::optional { - using enum QueryExecutionTree::OperationType; - const bool aIsTransPath = a._qet->getType() == TRANSITIVE_PATH; - const bool bIsTransPath = b._qet->getType() == TRANSITIVE_PATH; + auto aTransPath = + std::dynamic_pointer_cast(a._qet->getRootOperation()); + auto bTransPath = + std::dynamic_pointer_cast(b._qet->getRootOperation()); - if (!(aIsTransPath || bIsTransPath)) { + if (!(aTransPath || bTransPath)) { return std::nullopt; } - std::shared_ptr otherTree = - aIsTransPath ? b._qet : a._qet; - auto& transPathTree = aIsTransPath ? a._qet : b._qet; - auto transPathOperation = std::dynamic_pointer_cast( - transPathTree->getRootOperation()); + std::shared_ptr otherTree = aTransPath ? b._qet : a._qet; + auto transPathOperation = aTransPath ? aTransPath : bTransPath; // TODO: Handle the case of two or more common variables if (jcs.size() > 1) { @@ -1902,8 +1899,8 @@ auto QueryPlanner::createJoinWithTransitivePath( "Transitive Path operation with more than" " two common variables is not supported"); } - const size_t otherCol = aIsTransPath ? jcs[0][1] : jcs[0][0]; - const size_t thisCol = aIsTransPath ? jcs[0][0] : jcs[0][1]; + const size_t otherCol = aTransPath ? jcs[0][1] : jcs[0][0]; + const size_t thisCol = aTransPath ? jcs[0][0] : jcs[0][1]; // Do not bind the side of a path twice if (transPathOperation->isBoundOrId()) { return std::nullopt; @@ -1934,11 +1931,14 @@ auto QueryPlanner::createJoinWithHasPredicateScan( // If the join column corresponds to the has-predicate scan's // subject column we can use a specialized join that avoids // loading the full has-predicate predicate. - using enum QueryExecutionTree::OperationType; auto isSuitablePredicateScan = [](const auto& tree, size_t joinColumn) { - return tree._qet->getType() == HAS_PREDICATE_SCAN && joinColumn == 0 && - static_cast(tree._qet->getRootOperation().get()) - ->getType() == HasPredicateScan::ScanType::FULL_SCAN; + if (joinColumn == 0) { + auto rootOperation = std::dynamic_pointer_cast( + tree._qet->getRootOperation()); + return rootOperation && + rootOperation->getType() == HasPredicateScan::ScanType::FULL_SCAN; + } + return false; }; const bool aIsSuitablePredicateScan = isSuitablePredicateScan(a, jcs[0][0]); diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index 4078bd04c4..a08cf48e02 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -374,9 +374,9 @@ class QueryPlanner { const SubtreePlan& plan, const vector& orderedOnColumns) const; - [[nodiscard]] void applyFiltersIfPossible( - std::vector& row, const std::vector& filters, - bool replaceInsteadOfAddPlans) const; + template + void applyFiltersIfPossible(std::vector& row, + const std::vector& filters) const; /** * @brief Optimize a set of triples, filters and precomputed candidates diff --git a/src/parser/Literal.h b/src/parser/Literal.h index 6e40d4e0c8..5367c261ae 100644 --- a/src/parser/Literal.h +++ b/src/parser/Literal.h @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include "parser/Iri.h" #include "parser/NormalizedString.h" diff --git a/src/util/Conversions.h b/src/util/Conversions.h index c49f877c41..bbd4e901f2 100644 --- a/src/util/Conversions.h +++ b/src/util/Conversions.h @@ -4,16 +4,14 @@ #pragma once -#include #include #include #include "parser/LiteralOrIri.h" -#include "util/StringUtils.h" namespace ad_utility { -static constexpr std::string_view languageTaggedPredicatePrefix = "@"; +constexpr std::string_view languageTaggedPredicatePrefix = "@"; //! Convert a language tag like "@en" to the corresponding entity uri //! for the efficient language filter. // TODO The overload that takes and returns `std::string` can be diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp index e67c9857f8..529d10c144 100644 --- a/test/SparqlParserTest.cpp +++ b/test/SparqlParserTest.cpp @@ -8,10 +8,11 @@ #include -#include "./util/TripleComponentTestHelpers.h" #include "SparqlAntlrParserTestHelpers.h" #include "global/Constants.h" #include "parser/SparqlParser.h" +#include "util/Conversions.h" +#include "util/TripleComponentTestHelpers.h" namespace m = matchers; namespace p = parsedQuery; diff --git a/test/engine/CartesianProductJoinTest.cpp b/test/engine/CartesianProductJoinTest.cpp index 26782ad410..33eebf82c3 100644 --- a/test/engine/CartesianProductJoinTest.cpp +++ b/test/engine/CartesianProductJoinTest.cpp @@ -5,12 +5,12 @@ #include #include +#include "../engine/ValuesForTesting.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" #include "../util/IndexTestHelpers.h" #include "engine/CartesianProductJoin.h" #include "engine/QueryExecutionTree.h" -#include "engine/ValuesForTesting.h" using namespace ad_utility::testing; using ad_utility::source_location; diff --git a/src/engine/ValuesForTesting.h b/test/engine/ValuesForTesting.h similarity index 100% rename from src/engine/ValuesForTesting.h rename to test/engine/ValuesForTesting.h diff --git a/test/util/IdTableHelpers.cpp b/test/util/IdTableHelpers.cpp index 99360cd8a9..bdb0c3a865 100644 --- a/test/util/IdTableHelpers.cpp +++ b/test/util/IdTableHelpers.cpp @@ -4,12 +4,10 @@ #include "../test/util/IdTableHelpers.h" -#include - #include #include -#include "engine/ValuesForTesting.h" +#include "../engine/ValuesForTesting.h" #include "engine/idTable/IdTable.h" #include "global/ValueId.h" #include "util/Algorithm.h" diff --git a/test/util/IdTableHelpers.h b/test/util/IdTableHelpers.h index 37a6f71846..75d34bad0a 100644 --- a/test/util/IdTableHelpers.h +++ b/test/util/IdTableHelpers.h @@ -12,6 +12,7 @@ #include #include +#include "../engine/ValuesForTesting.h" #include "./AllocatorTestHelpers.h" #include "./GTestHelpers.h" #include "./IdTestHelpers.h" @@ -20,7 +21,6 @@ #include "engine/Join.h" #include "engine/OptionalJoin.h" #include "engine/QueryExecutionTree.h" -#include "engine/ValuesForTesting.h" #include "engine/idTable/IdTable.h" #include "global/ValueId.h" #include "util/Algorithm.h" diff --git a/test/util/OperationTestHelpers.h b/test/util/OperationTestHelpers.h index 7c7402cb99..8764aab4b7 100644 --- a/test/util/OperationTestHelpers.h +++ b/test/util/OperationTestHelpers.h @@ -85,10 +85,4 @@ class ShallowParentOperation : public Operation { } }; -template <> -void QueryExecutionTree::setOperation( - std::shared_ptr operation) { - rootOperation_ = std::move(operation); -} - #endif // QLEVER_OPERATIONTESTHELPERS_H