diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp index 98f41aebb2..622d629142 100644 --- a/src/engine/CheckUsePatternTrick.cpp +++ b/src/engine/CheckUsePatternTrick.cpp @@ -77,6 +77,82 @@ bool isVariableContainedInGraphPatternOperation( }); } +// Internal helper function. +// Modify the `triples` s.t. the patterns for `subAndPred.subject_` will appear +// in a column with the variable `subAndPred.predicate_` when evaluating and +// joining all the triples. This can be either done by retrieving one of the +// additional columns where the patterns are stored in the PSO and POS +// permutation or, if no triple suitable for adding this column exists, by +// adding a triple `?subject ql:has-pattern ?predicate`. +static void rewriteTriplesForPatternTrick(const PatternTrickTuple& subAndPred, + std::vector& triples) { + // The following lambda tries to find a triple in the `triples` that has the + // subject variable of the pattern trick in its `triplePosition` (which is + // either the subject or the object) and a fixed predicate (no variable). If + // such a triple is found, it is modified s.t. it also scans the + // `additionalScanColumn` which has to be the index of the column where the + // patterns of the `triplePosition` are stored in the POS and PSO permutation. + // Return true iff such a triple was found and replaced. 
+ auto findAndRewriteMatchingTriple = [&subAndPred, &triples]( + auto triplePosition, + size_t additionalScanColumn) { + auto matchingTriple = std::ranges::find_if( + triples, [&subAndPred, triplePosition](const SparqlTriple& t) { + return std::invoke(triplePosition, t) == subAndPred.subject_ && + t._p.isIri() && !isVariable(t._p); + }); + if (matchingTriple == triples.end()) { + return false; + } + matchingTriple->_additionalScanColumns.emplace_back(additionalScanColumn, + subAndPred.predicate_); + return true; + }; + + if (findAndRewriteMatchingTriple(&SparqlTriple::_s, + ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN)) { + return; + } else if (findAndRewriteMatchingTriple( + &SparqlTriple::_o, ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN)) { + return; + } else { + // We could not find a suitable triple to append the additional column, we + // therefore add an explicit triple `?s ql:has-pattern ?p` + triples.emplace_back(subAndPred.subject_, HAS_PATTERN_PREDICATE, + subAndPred.predicate_); + } +} + +// Helper function for `checkUsePatternTrick`. +// Check if any of the triples in the `graphPattern` has the form `?s +// ql:has-predicate ?p` or `?s ?p ?o` and that the other conditions for the +// pattern trick are fulfilled (namely that the variables `?p` and if present +// `?o` don't appear elsewhere in the `parsedQuery`). If such a triple is found, +// the query is modified such that it behaves as if the triple was replaced by +// `?s ql:has-pattern ?p`. See the documentation of +// `rewriteTriplesForPatternTrick` above. +static std::optional findPatternTrickTuple( + p::BasicGraphPattern* graphPattern, const ParsedQuery* parsedQuery, + const std::optional< + sparqlExpression::SparqlExpressionPimpl::VariableAndDistinctness>& + countedVariable) { + // Try to find a triple that either has `ql:has-predicate` as the predicate, + // or consists of three variables, and fulfills all the other preconditions + // for the pattern trick. 
+ auto& triples = graphPattern->_triples; + for (auto it = triples.begin(); it != triples.end(); ++it) { + auto patternTrickTuple = + isTripleSuitableForPatternTrick(*it, parsedQuery, countedVariable); + if (!patternTrickTuple.has_value()) { + continue; + } + triples.erase(it); + rewriteTriplesForPatternTrick(patternTrickTuple.value(), triples); + return patternTrickTuple; + } + return std::nullopt; +} + // ____________________________________________________________________________ std::optional checkUsePatternTrick( ParsedQuery* parsedQuery) { @@ -109,19 +185,10 @@ std::optional checkUsePatternTrick( continue; } - // Try to find a triple that either has `ql:has-predicate` as the predicate, - // or consists of three variables, and fulfills all the other preconditions - // for the pattern trick. - auto& triples = curPattern->_triples; - for (auto it = triples.begin(); it != triples.end(); ++it) { - auto patternTrickTuple = - isTripleSuitableForPatternTrick(*it, parsedQuery, countedVariable); - if (patternTrickTuple.has_value()) { - // Remove the triple from the graph. Note that this invalidates the - // reference `triple`, so we perform this step at the very end. - triples.erase(it); - return patternTrickTuple; - } + auto patternTrickTuple = + findPatternTrickTuple(curPattern, parsedQuery, countedVariable); + if (patternTrickTuple.has_value()) { + return patternTrickTuple; } } // No suitable triple for the pattern trick was found. diff --git a/src/engine/CheckUsePatternTrick.h b/src/engine/CheckUsePatternTrick.h index 8f2d37ac4f..47db399638 100644 --- a/src/engine/CheckUsePatternTrick.h +++ b/src/engine/CheckUsePatternTrick.h @@ -19,8 +19,13 @@ struct PatternTrickTuple { * @brief Determines if the pattern trick (and in turn the * CountAvailablePredicates operation) is applicable to the given * parsed query. If a ql:has-predicate triple is found and - * CountAvailablePredicates can be used for it, the triple will be removed from - * the parsed query. 
+ * CountAvailablePredicates can be used for it, the triple's predicate will be + * replaced by `ql:has-pattern`. If possible, then this rewrite is performed by + * completely removing the triple and adding the pattern as an + * additional scan column to one of the other triples (note that we have folded + * the patterns for the subject and object into the PSO and POS permutation). + * The mapping from the pattern to the predicates contained in that pattern will + * later be done by the `CountAvailablePredicates` operation. */ std::optional checkUsePatternTrick(ParsedQuery* parsedQuery); diff --git a/src/engine/CountAvailablePredicates.cpp b/src/engine/CountAvailablePredicates.cpp index e946a7446a..fd6e874d7a 100644 --- a/src/engine/CountAvailablePredicates.cpp +++ b/src/engine/CountAvailablePredicates.cpp @@ -2,19 +2,11 @@ // Chair of Algorithms and Data Structures. // Author: Florian Kramer (florian.kramer@neptun.uni-freiburg.de) -#include "./CountAvailablePredicates.h" +#include "engine/CountAvailablePredicates.h" -#include "./CallFixedSize.h" - -// _____________________________________________________________________________ -CountAvailablePredicates::CountAvailablePredicates(QueryExecutionContext* qec, - Variable predicateVariable, - Variable countVariable) - : Operation(qec), - _subtree(nullptr), - _subjectColumnIndex(0), - _predicateVariable(std::move(predicateVariable)), - _countVariable(std::move(countVariable)) {} +#include "engine/CallFixedSize.h" +#include "engine/IndexScan.h" +#include "index/IndexImpl.h" // _____________________________________________________________________________ CountAvailablePredicates::CountAvailablePredicates( @@ -22,27 +14,27 @@ CountAvailablePredicates::CountAvailablePredicates( size_t subjectColumnIndex, Variable predicateVariable, Variable countVariable) : Operation(qec), - _subtree(QueryExecutionTree::createSortedTree(std::move(subtree), + subtree_(QueryExecutionTree::createSortedTree(std::move(subtree), 
{subjectColumnIndex})), - _subjectColumnIndex(subjectColumnIndex), - _predicateVariable(std::move(predicateVariable)), - _countVariable(std::move(countVariable)) {} + subjectColumnIndex_(subjectColumnIndex), + predicateVariable_(std::move(predicateVariable)), + countVariable_(std::move(countVariable)) {} // _____________________________________________________________________________ string CountAvailablePredicates::getCacheKeyImpl() const { std::ostringstream os; - if (_subtree == nullptr) { + if (subtree_ == nullptr) { os << "COUNT_AVAILABLE_PREDICATES for all entities"; } else { - os << "COUNT_AVAILABLE_PREDICATES (col " << _subjectColumnIndex << ")\n" - << _subtree->getCacheKey(); + os << "COUNT_AVAILABLE_PREDICATES (col " << subjectColumnIndex_ << ")\n" + << subtree_->getCacheKey(); } return std::move(os).str(); } // _____________________________________________________________________________ string CountAvailablePredicates::getDescriptor() const { - if (_subtree == nullptr) { + if (subtree_ == nullptr) { return "CountAvailablePredicates for a all entities"; } return "CountAvailablePredicates"; @@ -62,8 +54,8 @@ VariableToColumnMap CountAvailablePredicates::computeVariableToColumnMap() const { VariableToColumnMap varCols; auto col = makeAlwaysDefinedColumn; - varCols[_predicateVariable] = col(0); - varCols[_countVariable] = col(1); + varCols[predicateVariable_] = col(0); + varCols[countVariable_] = col(1); return varCols; } @@ -77,14 +69,14 @@ float CountAvailablePredicates::getMultiplicity([[maybe_unused]] size_t col) { // _____________________________________________________________________________ uint64_t CountAvailablePredicates::getSizeEstimateBeforeLimit() { - if (_subtree.get() != nullptr) { + if (subtree_.get() != nullptr) { // Predicates are only computed for entities in the subtrees result. 
// This estimate is probably wildly innacurrate, but as it does not // depend on the order of operations of the subtree should be sufficient // for the type of optimizations the optimizer can currently do. - size_t num_distinct = _subtree->getSizeEstimate() / - _subtree->getMultiplicity(_subjectColumnIndex); + size_t num_distinct = subtree_->getSizeEstimate() / + subtree_->getMultiplicity(subjectColumnIndex_); return num_distinct / getIndex().getAvgNumDistinctSubjectsPerPredicate(); } else { // Predicates are counted for all entities. In this case the size estimate @@ -96,11 +88,11 @@ uint64_t CountAvailablePredicates::getSizeEstimateBeforeLimit() { // _____________________________________________________________________________ size_t CountAvailablePredicates::getCostEstimate() { - if (_subtree.get() != nullptr) { + if (subtree_.get() != nullptr) { // Without knowing the ratio of elements that will have a pattern assuming // constant cost per entry should be reasonable (altough non distinct // entries are of course actually cheaper). - return _subtree->getCostEstimate() + _subtree->getSizeEstimate(); + return subtree_->getCostEstimate() + subtree_->getSizeEstimate(); } else { // the cost is proportional to the number of elements we need to write. return getSizeEstimateBeforeLimit(); @@ -113,68 +105,84 @@ ResultTable CountAvailablePredicates::computeResult() { IdTable idTable{getExecutionContext()->getAllocator()}; idTable.setNumColumns(2); - const std::vector& hasPattern = - _executionContext->getIndex().getHasPattern(); - const CompactVectorOfStrings& hasPredicate = - _executionContext->getIndex().getHasPredicate(); const CompactVectorOfStrings& patterns = _executionContext->getIndex().getPatterns(); - if (_subtree == nullptr) { + AD_CORRECTNESS_CHECK(subtree_); + // Determine whether we can perform the full scan optimization. It can be + // applied if the `subtree_` is a single index scan of a triple + // `?s ql:has-pattern ?p`. 
+ // TODO As soon as we have a lazy implementation for all index scans + // or even all operations, the special case for all entities can be + // removed. + bool isPatternTrickForAllEntities = [&]() { + auto indexScan = + dynamic_cast(subtree_->getRootOperation().get()); + if (!indexScan) { + return false; + } + if (!indexScan->getSubject().isVariable() || + !indexScan->getObject().isVariable()) { + return false; + } + + return indexScan->getPredicate() == HAS_PATTERN_PREDICATE; + }(); + + if (isPatternTrickForAllEntities) { + subtree_->getRootOperation()->updateRuntimeInformationWhenOptimizedOut( + RuntimeInformation::Status::lazilyMaterialized); // Compute the predicates for all entities - CountAvailablePredicates::computePatternTrickAllEntities( - &idTable, hasPattern, hasPredicate, patterns); + CountAvailablePredicates::computePatternTrickAllEntities(&idTable, + patterns); return {std::move(idTable), resultSortedOn(), LocalVocab{}}; } else { - std::shared_ptr subresult = _subtree->getResult(); + std::shared_ptr subresult = subtree_->getResult(); LOG(DEBUG) << "CountAvailablePredicates subresult computation done." 
<< std::endl; size_t width = subresult->idTable().numColumns(); + size_t patternColumn = subtree_->getVariableColumn(predicateVariable_); CALL_FIXED_SIZE(width, &computePatternTrick, subresult->idTable(), &idTable, - hasPattern, hasPredicate, patterns, _subjectColumnIndex, + patterns, subjectColumnIndex_, patternColumn, runtimeInfo()); return {std::move(idTable), resultSortedOn(), subresult->getSharedLocalVocab()}; } } +// _____________________________________________________________________________ void CountAvailablePredicates::computePatternTrickAllEntities( - IdTable* dynResult, const vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns) { + IdTable* dynResult, const CompactVectorOfStrings& patterns) const { IdTableStatic<2> result = std::move(*dynResult).toStatic<2>(); LOG(DEBUG) << "For all entities." << std::endl; ad_utility::HashMap predicateCounts; ad_utility::HashMap patternCounts; - - size_t maxId = std::max(hasPattern.size(), hasPredicate.size()); - for (size_t i = 0; i < maxId; i++) { - if (i < hasPattern.size() && hasPattern[i] != NO_PATTERN) { - patternCounts[hasPattern[i]]++; - } else if (i < hasPredicate.size()) { - auto predicates = hasPredicate[i]; - for (const auto& predicate : predicates) { - auto it = predicateCounts.find(predicate); - if (it == predicateCounts.end()) { - predicateCounts[predicate] = 1; - } else { - it->second++; - } - } + auto fullHasPattern = + getExecutionContext() + ->getIndex() + .getImpl() + .getPermutation(Permutation::Enum::PSO) + .lazyScan(qlever::specialIds.at(HAS_PATTERN_PREDICATE), std::nullopt, + std::nullopt, {}, cancellationHandle_); + for (const auto& idTable : fullHasPattern) { + for (const auto& patternId : idTable.getColumn(1)) { + AD_CORRECTNESS_CHECK(patternId.getDatatype() == Datatype::Int); + patternCounts[patternId.getInt()]++; } } LOG(DEBUG) << "Using " << patternCounts.size() - << " patterns for computing the result." 
<< std::endl; - for (const auto& it : patternCounts) { - for (const auto& predicate : patterns[it.first]) { - predicateCounts[predicate] += it.second; + << " patterns for computing the result" << std::endl; + for (const auto& [patternIdx, count] : patternCounts) { + AD_CORRECTNESS_CHECK(patternIdx < patterns.size()); + for (const auto& predicate : patterns[patternIdx]) { + predicateCounts[predicate] += count; } } result.reserve(predicateCounts.size()); - for (const auto& it : predicateCounts) { - result.push_back({it.first, Id::makeFromInt(it.second)}); + for (const auto& [predicateId, count] : predicateCounts) { + result.push_back({predicateId, Id::makeFromInt(count)}); } *dynResult = std::move(result).toDynamic(); } @@ -199,17 +207,16 @@ class MergeableHashMap : public ad_utility::HashMap { } }; +// _____________________________________________________________________________ template void CountAvailablePredicates::computePatternTrick( const IdTable& dynInput, IdTable* dynResult, - const vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns, const size_t subjectColumn, - RuntimeInformation& runtimeInfo) { + const CompactVectorOfStrings& patterns, const size_t subjectColumnIdx, + const size_t patternColumnIdx, RuntimeInformation& runtimeInfo) { const IdTableView input = dynInput.asStaticView(); IdTableStatic<2> result = std::move(*dynResult).toStatic<2>(); LOG(DEBUG) << "For " << input.size() << " entities in column " - << subjectColumn << std::endl; + << subjectColumnIdx << std::endl; MergeableHashMap predicateCounts; MergeableHashMap patternCounts; @@ -229,6 +236,8 @@ void CountAvailablePredicates::computePatternTrick( size_t numListPredicates = 0; if (input.size() > 0) { // avoid strange OpenMP segfaults on GCC + decltype(auto) subjectColumn = input.getColumn(subjectColumnIdx); + decltype(auto) patternColumn = input.getColumn(patternColumnIdx); #pragma omp parallel #pragma omp single #pragma omp taskloop 
grainsize(500000) default(none) \ @@ -236,43 +245,14 @@ void CountAvailablePredicates::computePatternTrick( reduction(MergeHashmapsSizeT : patternCounts) \ reduction(+ : numEntitiesWithPatterns) reduction(+ : numPatternPredicates) \ reduction(+ : numListPredicates) \ - shared(input, subjectColumn, hasPattern, hasPredicate) - for (size_t inputIdx = 0; inputIdx < input.size(); ++inputIdx) { + shared(input, subjectColumn, patternColumn) + for (size_t i = 0; i < input.size(); ++i) { // Skip over elements with the same subject (don't count them twice) - Id subjectId = input(inputIdx, subjectColumn); - if (inputIdx > 0 && subjectId == input(inputIdx - 1, subjectColumn)) { + Id subjectId = subjectColumn[i]; + if (i > 0 && subjectId == subjectColumn[i - 1]) { continue; } - if (subjectId.getDatatype() != Datatype::VocabIndex) { - // Ignore numeric literals and other types that are folded into - // the value IDs. They can never be subjects and thus also have no - // patterns. - continue; - } - auto subject = subjectId.getVocabIndex().get(); - - if (subject < hasPattern.size() && hasPattern[subject] != NO_PATTERN) { - // The subject matches a pattern - patternCounts[hasPattern[subject]]++; - numEntitiesWithPatterns++; - } else if (subject < hasPredicate.size()) { - // The subject does not match a pattern - const auto& pattern = hasPredicate[subject]; - numListPredicates += pattern.size(); - if (!pattern.empty()) { - for (const auto& predicate : pattern) { - predicateCounts[predicate]++; - } - } else { - LOG(TRACE) << "No pattern or has-relation entry found for entity " - << std::to_string(subject) << std::endl; - } - } else { - LOG(TRACE) << "Subject " << subject - << " does not appear to be an entity " - "(its id is to high)." 
- << std::endl; - } + patternCounts[patternColumn[i].getInt()]++; } } LOG(DEBUG) << "Using " << patternCounts.size() @@ -288,6 +268,7 @@ void CountAvailablePredicates::computePatternTrick( LOG(DEBUG) << "Start translating pattern counts to predicate counts" << std::endl; + bool illegalPatternIndexFound = false; if (patternVec.begin() != patternVec.end()) { // avoid segfaults with OpenMP on GCC #pragma omp parallel @@ -296,13 +277,22 @@ void CountAvailablePredicates::computePatternTrick( reduction(MergeHashmapsId : predicateCounts) \ reduction(+ : numPredicatesSubsumedInPatterns) \ reduction(+ : numEntitiesWithPatterns) reduction(+ : numPatternPredicates) \ - reduction(+ : numListPredicates) shared(patternVec, patterns) + reduction(+ : numListPredicates) shared(patternVec, patterns) \ + reduction(|| : illegalPatternIndexFound) // TODO When we use iterators (`patternVec.begin()`) for the loop, // there is a strange warning on clang15 when OpenMP is activated. Find out // whether this is a known issue and whether this will be fixed in later // versions of clang. for (size_t i = 0; i != patternVec.size(); ++i) { auto [patternIndex, patternCount] = patternVec[i]; + // TODO As soon as we have a better way of handling the + // parallelism, the following block can become a simple AD_CONTRACT_CHECK. 
+ if (patternIndex >= patterns.size()) { + if (patternIndex != NO_PATTERN) { + illegalPatternIndexFound = true; + } + continue; + } const auto& pattern = patterns[patternIndex]; numPatternPredicates += pattern.size(); for (const auto& predicate : pattern) { @@ -311,6 +301,7 @@ void CountAvailablePredicates::computePatternTrick( } } } + AD_CONTRACT_CHECK(!illegalPatternIndexFound); LOG(DEBUG) << "Finished translating pattern counts to predicate counts" << std::endl; // write the predicate counts to the result @@ -349,7 +340,7 @@ void CountAvailablePredicates::computePatternTrick( LOG(DEBUG) << "The conceptual cost with patterns was " << costWithPatterns << " vs " << costWithoutPatterns << " without patterns" << std::endl; - // Print the cost improvement using the the pattern trick gave us + // Print the cost improvement using the pattern trick gave us LOG(DEBUG) << "This gives a ratio with to without of " << costRatio << std::endl; diff --git a/src/engine/CountAvailablePredicates.h b/src/engine/CountAvailablePredicates.h index 78b7775ca4..10d6468529 100644 --- a/src/engine/CountAvailablePredicates.h +++ b/src/engine/CountAvailablePredicates.h @@ -23,23 +23,15 @@ using std::vector; // specified input column as its subject. The second output column contains a // count of how many of the input entities fulfill that requirement for that // predicate. This operation requires the use of the usePatterns option both -// when building as well as when loading the index. +// when building and when loading the index. class CountAvailablePredicates : public Operation { private: - std::shared_ptr _subtree; - size_t _subjectColumnIndex; - Variable _predicateVariable; - Variable _countVariable; + std::shared_ptr subtree_; + size_t subjectColumnIndex_; + Variable predicateVariable_; + Variable countVariable_; public: - /** - * @brief Creates a new CountAvailablePredicates operation that returns - * predicates and their counts for all entities. 
- */ - explicit CountAvailablePredicates(QueryExecutionContext* qec, - Variable predicateVariable, - Variable countVariable); - /** * @brief Creates a new CountAvailablePredicates operation that returns * predicates and their counts for the entities in column subjectColumnIndex @@ -62,18 +54,18 @@ class CountAvailablePredicates : public Operation { vector getChildren() override { using R = vector; - return _subtree != nullptr ? R{_subtree.get()} : R{}; + return subtree_ != nullptr ? R{subtree_.get()} : R{}; } void setTextLimit(size_t limit) override { - if (_subtree != nullptr) { - _subtree->setTextLimit(limit); + if (subtree_ != nullptr) { + subtree_->setTextLimit(limit); } } bool knownEmptyResult() override { - if (_subtree != nullptr) { - return _subtree->knownEmptyResult(); + if (subtree_ != nullptr) { + return subtree_->knownEmptyResult(); } return false; } @@ -86,33 +78,37 @@ class CountAvailablePredicates : public Operation { public: size_t getCostEstimate() override; - // This method is declared here solely for unit testing purposes + // Getters for testing. + size_t subjectColumnIndex() const { return subjectColumnIndex_; } + const Variable& predicateVariable() const { return predicateVariable_; } + const Variable& countVariable() const { return countVariable_; } + + private: /** * @brief Computes all relations that have one of input[inputCol]'s entities * as a subject and counts the number of their occurrences. * @param input The input table of entity ids * @param result A table with two columns, one for predicate ids, * one for counts - * @param hasPattern A mapping from entity ids to pattern ids (or NO_PATTERN) - * @param hasPredicate A mapping from entity ids to sets of relations * @param patterns A mapping from pattern ids to patterns - * @param subjectColumn The column containing the entities for which the + * @param subjectColumnIdx The column containing the entities for which the * relations should be counted. 
+ * @param patternColumnIdx The column containing the pattern IDs (previously + * obtained via a scan of the `ql:has-pattern` predicate). */ template - static void computePatternTrick( - const IdTable& input, IdTable* result, - const vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns, size_t subjectColumn, - RuntimeInformation& runtimeInfo); - - static void computePatternTrickAllEntities( - IdTable* result, const vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns); + static void computePatternTrick(const IdTable& input, IdTable* result, + const CompactVectorOfStrings& patterns, + size_t subjectColumnIdx, + size_t patternColumnIdx, + RuntimeInformation& runtimeInfo); + + // Special implementation for the full pattern trick. + // Perform a lazy scan over the full `ql:has-pattern` relation, + // and then count and expand the patterns. + void computePatternTrickAllEntities( + IdTable* result, const CompactVectorOfStrings& patterns) const; - private: ResultTable computeResult() override; [[nodiscard]] VariableToColumnMap computeVariableToColumnMap() const override; }; diff --git a/src/engine/HasPredicateScan.cpp b/src/engine/HasPredicateScan.cpp index 4a5b1aedf8..4b27cad667 100644 --- a/src/engine/HasPredicateScan.cpp +++ b/src/engine/HasPredicateScan.cpp @@ -1,83 +1,130 @@ // Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. 
+// Authors: (2018 - 2019) Florian Kramer (florian.kramer@mail.uni-freiburg.de) +// (2024 - ) Johannes Kalmbach (kalmbach@cs.uni-freiburg.de) -#include "HasPredicateScan.h" +#include "engine/HasPredicateScan.h" -#include "CallFixedSize.h" +#include "engine/AddCombinedRowToTable.h" +#include "engine/CallFixedSize.h" +#include "engine/IndexScan.h" +#include "engine/Join.h" +#include "index/IndexImpl.h" +#include "util/JoinAlgorithms/JoinColumnMapping.h" +// Assert that the `type` is a valid value for the `ScanType` enum. +static void checkType(HasPredicateScan::ScanType type) { + using enum HasPredicateScan::ScanType; + static constexpr std::array supportedTypes{FREE_O, FREE_S, SUBQUERY_S, + FULL_SCAN}; + AD_CORRECTNESS_CHECK(ad_utility::contains(supportedTypes, type)); +} + +// Helper function for the constructor of the `HasPredicateScan`. +// Return a join operation between the `subtree` and the triple `?subject +// ql:has-pattern ?object` where the subject is specified by the +// `subtreeColIndex` which is an index into the `subtree`'s result columns and +// the `?object` is specified directly via the `objectVariable`. +// Also return the column index of the `objectVariable` in the final result. 
+static constexpr auto makeJoin = + [](auto* qec, std::shared_ptr subtree, + ColumnIndex subtreeColIndex, const Variable& objectVariable) + -> HasPredicateScan::SubtreeAndColumnIndex { + const auto& subtreeVar = + subtree->getVariableAndInfoByColumnIndex(subtreeColIndex).first; + auto hasPatternScan = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, + SparqlTriple{subtreeVar, HAS_PATTERN_PREDICATE, objectVariable}); + auto joinedSubtree = ad_utility::makeExecutionTree( + qec, std::move(subtree), std::move(hasPatternScan), subtreeColIndex, 0); + auto column = + joinedSubtree->getVariableColumns().at(objectVariable).columnIndex_; + return {std::move(joinedSubtree), column}; +}; + +// ___________________________________________________________________________ HasPredicateScan::HasPredicateScan(QueryExecutionContext* qec, std::shared_ptr subtree, size_t subtreeJoinColumn, - std::string objectVariable) + Variable objectVariable) : Operation{qec}, - _type{ScanType::SUBQUERY_S}, - _subtree{std::move(subtree)}, - _subtreeJoinColumn{subtreeJoinColumn}, - _object{std::move(objectVariable)} {} + type_{ScanType::SUBQUERY_S}, + subtree_{makeJoin(qec, std::move(subtree), subtreeJoinColumn, + Variable{objectVariable})}, + object_{std::move(objectVariable)} {} -HasPredicateScan::HasPredicateScan(QueryExecutionContext* qec, - SparqlTriple triple) - : Operation{qec} { - // Just pick one direction, they should be equivalent. +// A small helper function that sanitizes the `triple` which is passed to the +// constructor of `HasPredicateScan` and determines the corresponding +// `ScanType`. 
+static HasPredicateScan::ScanType getScanType(const SparqlTriple& triple) { + using enum HasPredicateScan::ScanType; AD_CONTRACT_CHECK(triple._p._iri == HAS_PREDICATE_PREDICATE); - // TODO(schnelle): Handle ?p ql:has-predicate ?p - _type = [&]() { - if (isVariable(triple._s) && (isVariable(triple._o))) { - if (triple._s == triple._o) { - throw std::runtime_error{ - "ql:has-predicate with same variable for subject and object not " - "supported."}; - } - return ScanType::FULL_SCAN; - } else if (isVariable(triple._s)) { - return ScanType::FREE_S; - } else if (isVariable(triple._o)) { - return ScanType::FREE_O; - } else { - AD_FAIL(); + if (isVariable(triple._s) && (isVariable(triple._o))) { + if (triple._s == triple._o) { + throw std::runtime_error{ + "ql:has-predicate with same variable for subject and object not " + "supported."}; } - }(); - setSubject(triple._s); - setObject(triple._o); + return FULL_SCAN; + } else if (isVariable(triple._s)) { + return FREE_S; + } else if (isVariable(triple._o)) { + return FREE_O; + } else { + AD_FAIL(); + } } +// ___________________________________________________________________________ +HasPredicateScan::HasPredicateScan(QueryExecutionContext* qec, + SparqlTriple triple) + : Operation{qec}, + type_{getScanType(triple)}, + subject_{triple._s}, + object_{triple._o} {} + +// ___________________________________________________________________________ string HasPredicateScan::getCacheKeyImpl() const { std::ostringstream os; - switch (_type) { + checkType(type_); + switch (type_) { case ScanType::FREE_S: - os << "HAS_PREDICATE_SCAN with O = " << _object; + os << "HAS_PREDICATE_SCAN with O = " << object_; break; case ScanType::FREE_O: - os << "HAS_PREDICATE_SCAN with S = " << _subject; + os << "HAS_PREDICATE_SCAN with S = " << subject_; break; case ScanType::FULL_SCAN: os << "HAS_PREDICATE_SCAN for the full relation"; break; case ScanType::SUBQUERY_S: - os << "HAS_PREDICATE_SCAN with S = " << _subtree->getCacheKey(); + os << 
"HAS_PREDICATE_SCAN with S = " << subtree().getCacheKey(); break; } return std::move(os).str(); } +// ___________________________________________________________________________ string HasPredicateScan::getDescriptor() const { - switch (_type) { + checkType(type_); + switch (type_) { case ScanType::FREE_S: - return "HasPredicateScan free subject: " + _subject; + return "HasPredicateScan free subject: " + subject_.toRdfLiteral(); case ScanType::FREE_O: - return "HasPredicateScan free object: " + _object; + return "HasPredicateScan free object: " + object_.toRdfLiteral(); case ScanType::FULL_SCAN: return "HasPredicateScan full scan"; case ScanType::SUBQUERY_S: - return "HasPredicateScan with a subquery on " + _subject; + return "HasPredicateScan with a subquery on " + subject_.toRdfLiteral(); default: return "HasPredicateScan"; } } +// ___________________________________________________________________________ size_t HasPredicateScan::getResultWidth() const { - switch (_type) { + checkType(type_); + switch (type_) { case ScanType::FREE_S: return 1; case ScanType::FREE_O: @@ -85,13 +132,15 @@ size_t HasPredicateScan::getResultWidth() const { case ScanType::FULL_SCAN: return 2; case ScanType::SUBQUERY_S: - return _subtree->getResultWidth() + 1; + return subtree().getResultWidth(); } return -1; } +// ___________________________________________________________________________ vector HasPredicateScan::resultSortedOn() const { - switch (_type) { + checkType(type_); + switch (type_) { case ScanType::FREE_S: // is the lack of sorting here a problem? 
return {}; @@ -100,103 +149,105 @@ vector HasPredicateScan::resultSortedOn() const { case ScanType::FULL_SCAN: return {0}; case ScanType::SUBQUERY_S: - return _subtree->resultSortedOn(); + return subtree().resultSortedOn(); } return {}; } +// ___________________________________________________________________________ VariableToColumnMap HasPredicateScan::computeVariableToColumnMap() const { - VariableToColumnMap varCols; - using V = Variable; // All the columns that are newly created by this operation contain no // undefined values. auto col = makeAlwaysDefinedColumn; - switch (_type) { + checkType(type_); + switch (type_) { case ScanType::FREE_S: - // TODO Better types for `_subject` and `_object`. - varCols.emplace(std::make_pair(V{_subject}, col(0))); - break; + return {{subject_.getVariable(), col(0)}}; case ScanType::FREE_O: - varCols.insert(std::make_pair(V{_object}, col(0))); - break; + return {{object_.getVariable(), col(0)}}; case ScanType::FULL_SCAN: - varCols.insert(std::make_pair(V{_subject}, col(0))); - varCols.insert(std::make_pair(V{_object}, col(1))); - break; + return {{subject_.getVariable(), col(0)}, + {object_.getVariable(), col(1)}}; case ScanType::SUBQUERY_S: - varCols = _subtree->getVariableColumns(); - varCols.insert(std::make_pair(V{_object}, col(getResultWidth() - 1))); - break; + return subtree().getVariableColumns(); } - return varCols; + AD_FAIL(); } +// ___________________________________________________________________________ void HasPredicateScan::setTextLimit(size_t limit) { - if (_type == ScanType::SUBQUERY_S) { - _subtree->setTextLimit(limit); + if (type_ == ScanType::SUBQUERY_S) { + subtree().setTextLimit(limit); } } +// ___________________________________________________________________________ bool HasPredicateScan::knownEmptyResult() { - if (_type == ScanType::SUBQUERY_S) { - return _subtree->knownEmptyResult(); + if (type_ == ScanType::SUBQUERY_S) { + return subtree().knownEmptyResult(); } else { return false; } } +// 
___________________________________________________________________________ float HasPredicateScan::getMultiplicity(size_t col) { - switch (_type) { + // Default value for columns about which we know nothing. + double result = 1.0; + switch (type_) { case ScanType::FREE_S: if (col == 0) { - return getIndex().getAvgNumDistinctPredicatesPerSubject(); + result = getIndex().getAvgNumDistinctPredicatesPerSubject(); } break; case ScanType::FREE_O: if (col == 0) { - return getIndex().getAvgNumDistinctSubjectsPerPredicate(); + result = getIndex().getAvgNumDistinctSubjectsPerPredicate(); } break; case ScanType::FULL_SCAN: if (col == 0) { - return getIndex().getAvgNumDistinctPredicatesPerSubject(); + result = getIndex().getAvgNumDistinctPredicatesPerSubject(); } else if (col == 1) { - return getIndex().getAvgNumDistinctSubjectsPerPredicate(); + result = getIndex().getAvgNumDistinctSubjectsPerPredicate(); } break; case ScanType::SUBQUERY_S: if (col < getResultWidth() - 1) { - return _subtree->getMultiplicity(col) * - getIndex().getAvgNumDistinctSubjectsPerPredicate(); + result = subtree().getMultiplicity(col) * + getIndex().getAvgNumDistinctSubjectsPerPredicate(); } else { - return _subtree->getMultiplicity(_subtreeJoinColumn) * - getIndex().getAvgNumDistinctSubjectsPerPredicate(); + result = subtree().getMultiplicity(subtreeColIdx()) * + getIndex().getAvgNumDistinctSubjectsPerPredicate(); } } - return 1; + return static_cast(result); } +// ___________________________________________________________________________ uint64_t HasPredicateScan::getSizeEstimateBeforeLimit() { - switch (_type) { + switch (type_) { case ScanType::FREE_S: - return static_cast( + return static_cast( getIndex().getAvgNumDistinctPredicatesPerSubject()); case ScanType::FREE_O: - return static_cast( + return static_cast( getIndex().getAvgNumDistinctSubjectsPerPredicate()); case ScanType::FULL_SCAN: return getIndex().getNumDistinctSubjectPredicatePairs(); case ScanType::SUBQUERY_S: - return 
_subtree->getSizeEstimate() * - getIndex().getAvgNumDistinctPredicatesPerSubject(); + return static_cast( + static_cast(subtree().getSizeEstimate()) * + getIndex().getAvgNumDistinctPredicatesPerSubject()); } return 0; } +// ___________________________________________________________________________ size_t HasPredicateScan::getCostEstimate() { // TODO: these size estimates only work if all predicates are functional - switch (_type) { + switch (type_) { case ScanType::FREE_S: return getSizeEstimateBeforeLimit(); case ScanType::FREE_O: @@ -204,221 +255,140 @@ size_t HasPredicateScan::getCostEstimate() { case ScanType::FULL_SCAN: return getSizeEstimateBeforeLimit(); case ScanType::SUBQUERY_S: - return _subtree->getCostEstimate() + getSizeEstimateBeforeLimit(); + return subtree().getCostEstimate() + getSizeEstimateBeforeLimit(); } return 0; } +// ___________________________________________________________________________ ResultTable HasPredicateScan::computeResult() { IdTable idTable{getExecutionContext()->getAllocator()}; idTable.setNumColumns(getResultWidth()); - const std::vector& hasPattern = getIndex().getHasPattern(); - const CompactVectorOfStrings& hasPredicate = getIndex().getHasPredicate(); const CompactVectorOfStrings& patterns = getIndex().getPatterns(); + auto hasPattern = + getExecutionContext() + ->getIndex() + .getImpl() + .getPermutation(Permutation::Enum::PSO) + .lazyScan(qlever::specialIds.at(HAS_PATTERN_PREDICATE), std::nullopt, + std::nullopt, {}, cancellationHandle_); - switch (_type) { + auto getId = [this](const TripleComponent tc) { + std::optional id = tc.toValueId(getIndex().getVocab()); + if (!id.has_value()) { + AD_THROW("The entity '" + tc.toRdfLiteral() + + "' required by `ql:has-predicate` is not in the vocabulary."); + } + return id.value(); + }; + switch (type_) { case ScanType::FREE_S: { - Id objectId; - if (!getIndex().getId(_object, &objectId)) { - AD_THROW("The predicate '" + _object + "' is not in the vocabulary."); - } - 
HasPredicateScan::computeFreeS(&idTable, objectId, hasPattern, - hasPredicate, patterns); + HasPredicateScan::computeFreeS(&idTable, getId(object_), hasPattern, + patterns); return {std::move(idTable), resultSortedOn(), LocalVocab{}}; }; case ScanType::FREE_O: { - Id subjectId; - if (!getIndex().getId(_subject, &subjectId)) { - AD_THROW("The subject " + _subject + " is not in the vocabulary."); - } - HasPredicateScan::computeFreeO(&idTable, subjectId, hasPattern, - hasPredicate, patterns); + HasPredicateScan::computeFreeO(&idTable, getId(subject_), patterns); return {std::move(idTable), resultSortedOn(), LocalVocab{}}; }; case ScanType::FULL_SCAN: HasPredicateScan::computeFullScan( - &idTable, hasPattern, hasPredicate, patterns, + &idTable, hasPattern, patterns, getIndex().getNumDistinctSubjectPredicatePairs()); return {std::move(idTable), resultSortedOn(), LocalVocab{}}; case ScanType::SUBQUERY_S: - std::shared_ptr subresult = _subtree->getResult(); - int inWidth = subresult->idTable().numColumns(); - int outWidth = idTable.numColumns(); - CALL_FIXED_SIZE((std::array{inWidth, outWidth}), - HasPredicateScan::computeSubqueryS, &idTable, - subresult->idTable(), _subtreeJoinColumn, hasPattern, - hasPredicate, patterns); - return {std::move(idTable), resultSortedOn(), - subresult->getSharedLocalVocab()}; + auto width = static_cast(idTable.numColumns()); + auto doCompute = [this, &idTable, &patterns]() { + return computeSubqueryS(&idTable, patterns); + }; + return ad_utility::callFixedSize(width, doCompute); } AD_FAIL(); } +// ___________________________________________________________________________ void HasPredicateScan::computeFreeS( - IdTable* resultTable, Id objectId, const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, + IdTable* resultTable, Id objectId, auto& hasPattern, const CompactVectorOfStrings& patterns) { IdTableStatic<1> result = std::move(*resultTable).toStatic<1>(); - uint64_t entityIndex = 0; - while (entityIndex < 
hasPattern.size() || entityIndex < hasPredicate.size()) { - if (entityIndex < hasPattern.size() && - hasPattern[entityIndex] != NO_PATTERN) { - // add the pattern - const auto& pattern = patterns[hasPattern[entityIndex]]; + // TODO This can be a much simpler and cheaper implementation that + // does a lazy scan on the specified predicate and then simply performs a + // DISTINCT on the result. + for (const auto& block : hasPattern) { + auto patternColumn = block.getColumn(1); + auto subjects = block.getColumn(0); + for (size_t i : ad_utility::integerRange(block.numRows())) { + const auto& pattern = patterns[patternColumn[i].getInt()]; for (const auto& predicate : pattern) { if (predicate == objectId) { - result.push_back( - {Id::makeFromVocabIndex(VocabIndex::make(entityIndex))}); - } - } - } else if (entityIndex < hasPredicate.size()) { - // add the relations - for (const auto& predicate : hasPredicate[entityIndex]) { - if (predicate == objectId) { - result.push_back( - {Id::makeFromVocabIndex(VocabIndex::make(entityIndex))}); + result.push_back({subjects[i]}); + break; } } } - entityIndex++; } *resultTable = std::move(result).toDynamic(); } +// ___________________________________________________________________________ void HasPredicateScan::computeFreeO( IdTable* resultTable, Id subjectAsId, - const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns) { - // Subjects always have to be from the vocabulary - if (subjectAsId.getDatatype() != Datatype::VocabIndex) { - return; + const CompactVectorOfStrings& patterns) const { + auto hasPattern = getExecutionContext() + ->getIndex() + .getImpl() + .getPermutation(Permutation::Enum::PSO) + .scan(qlever::specialIds.at(HAS_PATTERN_PREDICATE), + subjectAsId, {}, cancellationHandle_); + AD_CORRECTNESS_CHECK(hasPattern.numRows() <= 1); + for (Id patternId : hasPattern.getColumn(0)) { + const auto& pattern = patterns[patternId.getInt()]; + 
resultTable->resize(pattern.size()); + std::ranges::copy(pattern, resultTable->getColumn(0).begin()); } - IdTableStatic<1> result = std::move(*resultTable).toStatic<1>(); - - auto subjectIndex = subjectAsId.getVocabIndex().get(); - if (subjectIndex < hasPattern.size() && - hasPattern[subjectIndex] != NO_PATTERN) { - // add the pattern - const auto& pattern = patterns[hasPattern[subjectIndex]]; - for (const auto& predicate : pattern) { - result.push_back({predicate}); - } - } else if (subjectIndex < hasPredicate.size()) { - // add the relations - for (const auto& predicate : hasPredicate[subjectIndex]) { - result.push_back({predicate}); - } - } - *resultTable = std::move(result).toDynamic(); } +// ___________________________________________________________________________ void HasPredicateScan::computeFullScan( - IdTable* resultTable, const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, + IdTable* resultTable, auto& hasPattern, const CompactVectorOfStrings& patterns, size_t resultSize) { IdTableStatic<2> result = std::move(*resultTable).toStatic<2>(); result.reserve(resultSize); - - uint64_t subjectIndex = 0; - while (subjectIndex < hasPattern.size() || - subjectIndex < hasPredicate.size()) { - if (subjectIndex < hasPattern.size() && - hasPattern[subjectIndex] != NO_PATTERN) { - // add the pattern - for (const auto& predicate : patterns[hasPattern[subjectIndex]]) { - result.push_back( - {Id::makeFromVocabIndex(VocabIndex::make(subjectIndex)), - predicate}); - } - } else if (subjectIndex < hasPredicate.size()) { - // add the relations - for (const auto& predicate : hasPredicate[subjectIndex]) { - result.push_back( - {Id::makeFromVocabIndex(VocabIndex::make(subjectIndex)), - predicate}); + for (const auto& block : hasPattern) { + auto patternColumn = block.getColumn(1); + auto subjects = block.getColumn(0); + for (size_t i : ad_utility::integerRange(block.numRows())) { + const auto& pattern = patterns[patternColumn[i].getInt()]; + for (const 
auto& predicate : pattern) { + result.push_back({subjects[i], predicate}); } } - subjectIndex++; } *resultTable = std::move(result).toDynamic(); } -template -void HasPredicateScan::computeSubqueryS( - IdTable* dynResult, const IdTable& dynInput, const size_t subtreeColIndex, - const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns) { - IdTableStatic result = std::move(*dynResult).toStatic(); - const IdTableView input = dynInput.asStaticView(); - - LOG(DEBUG) << "HasPredicateScan subresult size " << input.size() << std::endl; - - for (size_t i = 0; i < input.size(); i++) { - Id subjectAsId = input(i, subtreeColIndex); - if (subjectAsId.getDatatype() != Datatype::VocabIndex) { - continue; +// ___________________________________________________________________________ +template +ResultTable HasPredicateScan::computeSubqueryS( + IdTable* dynResult, const CompactVectorOfStrings& patterns) { + auto subresult = subtree().getResult(); + auto patternCol = subtreeColIdx(); + auto result = std::move(*dynResult).toStatic(); + for (const auto& row : subresult->idTable().asStaticView()) { + const auto& pattern = patterns[row[patternCol].getInt()]; + for (auto predicate : pattern) { + result.push_back(row); + result.back()[patternCol] = predicate; } - auto subjectIndex = subjectAsId.getVocabIndex().get(); - if (subjectIndex < hasPattern.size() && - hasPattern[subjectIndex] != NO_PATTERN) { - // Expand the pattern and add it to the result - for (const auto& predicate : patterns[hasPattern[subjectIndex]]) { - result.emplace_back(); - size_t backIdx = result.size() - 1; - for (size_t k = 0; k < input.numColumns(); k++) { - result(backIdx, k) = input(i, k); - } - result(backIdx, input.numColumns()) = predicate; - } - } else if (subjectIndex < hasPredicate.size()) { - // add the relations - for (const auto& predicate : hasPredicate[subjectIndex]) { - result.emplace_back(); - size_t backIdx = result.size() - 1; - for 
(size_t k = 0; k < input.numColumns(); k++) { - result(backIdx, k) = input(i, k); - } - result(backIdx, input.numColumns()) = predicate; - } - } else { - break; - } - } - *dynResult = std::move(result).toDynamic(); -} - -void HasPredicateScan::setSubject(const TripleComponent& subject) { - // TODO Make the _subject and _object `Variant`. - if (subject.isString()) { - _subject = subject.getString(); - } else if (subject.isVariable()) { - _subject = subject.getVariable().name(); - } else { - throw ParseException{ - absl::StrCat("The subject of a ql:has-predicate triple must be an IRI " - "or a variable, but was \"", - subject.toString(), "\"")}; - } -} - -void HasPredicateScan::setObject(const TripleComponent& object) { - // TODO Make the _subject and _object `Variant`. - if (object.isString()) { - _object = object.getString(); - } else if (object.isVariable()) { - _object = object.getVariable().name(); - } else { - throw ParseException{ - absl::StrCat("The object of a ql:has-predicate triple must be an IRI " - "or a variable, but was \"", - object.toString(), "\"")}; } + return {std::move(result).toDynamic(), resultSortedOn(), + subresult->getSharedLocalVocab()}; } -const std::string& HasPredicateScan::getObject() const { return _object; } +// ___________________________________________________________________________ +const TripleComponent& HasPredicateScan::getObject() const { return object_; } -HasPredicateScan::ScanType HasPredicateScan::getType() const { return _type; } +// ___________________________________________________________________________ +HasPredicateScan::ScanType HasPredicateScan::getType() const { return type_; } diff --git a/src/engine/HasPredicateScan.h b/src/engine/HasPredicateScan.h index 2cd6bc9959..7ff5803b41 100644 --- a/src/engine/HasPredicateScan.h +++ b/src/engine/HasPredicateScan.h @@ -26,13 +26,29 @@ class HasPredicateScan : public Operation { SUBQUERY_S }; + struct SubtreeAndColumnIndex { + std::shared_ptr subtree_; + size_t 
subtreeJoinColumn_; + }; + private: - ScanType _type; - std::shared_ptr _subtree; - size_t _subtreeJoinColumn; + ScanType type_; + std::optional subtree_; + + QueryExecutionTree& subtree() { + auto* ptr = subtree_.value().subtree_.get(); + AD_CORRECTNESS_CHECK(ptr != nullptr); + return *ptr; + } + + const QueryExecutionTree& subtree() const { + return const_cast(*this).subtree(); + } + + size_t subtreeColIdx() const { return subtree_.value().subtreeJoinColumn_; } - std::string _subject; - std::string _object; + TripleComponent subject_; + TripleComponent object_; public: HasPredicateScan() = delete; @@ -40,17 +56,13 @@ class HasPredicateScan : public Operation { // TODO: The last argument should be of type `Variable`. HasPredicateScan(QueryExecutionContext* qec, std::shared_ptr subtree, - size_t subtreeJoinColumn, std::string objectVariable); + size_t subtreeJoinColumn, Variable objectVariable); HasPredicateScan(QueryExecutionContext* qec, SparqlTriple triple); private: [[nodiscard]] string getCacheKeyImpl() const override; - void setSubject(const TripleComponent& subject); - - void setObject(const TripleComponent& object); - public: [[nodiscard]] string getDescriptor() const override; @@ -73,38 +85,29 @@ class HasPredicateScan : public Operation { public: [[nodiscard]] ScanType getType() const; - [[nodiscard]] const std::string& getObject() const; + [[nodiscard]] const TripleComponent& getObject() const; vector getChildren() override { - if (_subtree) { - return {_subtree.get()}; + if (subtree_) { + return {std::addressof(subtree())}; } else { return {}; } } // These are made static and public mainly for easier testing - static void computeFreeS(IdTable* resultTable, Id objectId, - const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, + static void computeFreeS(IdTable* resultTable, Id objectId, auto& hasPattern, const CompactVectorOfStrings& patterns); - static void computeFreeO(IdTable* resultTable, Id subjectAsId, - const std::vector& 
hasPattern, - const CompactVectorOfStrings& hasPredicate, - const CompactVectorOfStrings& patterns); + void computeFreeO(IdTable* resultTable, Id subjectAsId, + const CompactVectorOfStrings& patterns) const; - static void computeFullScan(IdTable* resultTable, - const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, + static void computeFullScan(IdTable* resultTable, auto& hasPattern, const CompactVectorOfStrings& patterns, size_t resultSize); - template - static void computeSubqueryS(IdTable* result, const IdTable& _subtree, - size_t subtreeColIndex, - const std::vector& hasPattern, - const CompactVectorOfStrings& hasPredicate, + template + ResultTable computeSubqueryS(IdTable* result, const CompactVectorOfStrings& patterns); private: diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index 7e822146f6..6ca55e9f82 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -46,6 +46,8 @@ class IndexScan : public Operation { vector resultSortedOn() const override; + size_t numVariables() const { return numVariables_; } + void setTextLimit(size_t) override { // Do nothing. } diff --git a/src/engine/Join.h b/src/engine/Join.h index ab3f9dedcc..0e8c78b4d3 100644 --- a/src/engine/Join.h +++ b/src/engine/Join.h @@ -122,8 +122,16 @@ class Join : public Operation { ColumnIndex jc2, IdTable* dynRes); static bool isFullScanDummy(std::shared_ptr tree) { - return tree->getType() == QueryExecutionTree::SCAN && - tree->getResultWidth() == 3; + if (tree->getType() != QueryExecutionTree::SCAN) { + return false; + } + // Note: it is not sufficient to check `getResultWidth == 3` as + // the index scan might also have 2 variables + one additional column + // for the pattern trick (or any other additional column that we might add + // in the future). 
+ const auto& scan = + dynamic_cast(*tree->getRootOperation()); + return scan.numVariables() == 3; } protected: diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 20d2267329..62fde3a446 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -534,32 +534,28 @@ vector QueryPlanner::getPatternTrickRow( const p::SelectClause& selectClause, const vector>& dpTab, const checkUsePatternTrick::PatternTrickTuple& patternTrickTuple) { - const vector* previous = nullptr; + AD_CORRECTNESS_CHECK(!dpTab.empty()); + const vector& previous = dpTab.back(); auto aliases = selectClause.getAliases(); - if (!dpTab.empty()) { - previous = &dpTab.back(); - } + vector added; Variable predicateVariable = patternTrickTuple.predicate_; Variable countVariable = aliases.empty() ? generateUniqueVarName() : aliases[0]._target; - if (previous != nullptr && !previous->empty()) { - added.reserve(previous->size()); - for (const auto& parent : *previous) { - // Determine the column containing the subjects for which we are - // interested in their predicates. - auto subjectColumn = - parent._qet->getVariableColumn(patternTrickTuple.subject_); - auto patternTrickPlan = makeSubtreePlan( - _qec, parent._qet, subjectColumn, predicateVariable, countVariable); - added.push_back(std::move(patternTrickPlan)); - } - } else { - // Use the pattern trick without a subtree - SubtreePlan patternTrickPlan = makeSubtreePlan( - _qec, predicateVariable, countVariable); - added.push_back(std::move(patternTrickPlan)); + // Pattern tricks always contain at least one triple, otherwise something + // has gone wrong inside the `CheckUsePatternTrick` module. + AD_CORRECTNESS_CHECK(!previous.empty()); + added.reserve(previous.size()); + for (const auto& parent : previous) { + // Determine the column containing the subjects for which we are + // interested in their predicates. 
+ // TODO Move this lookup from subjects to columns + // into the `CountAvailablePredicates` class where it belongs + auto subjectColumn = + parent._qet->getVariableColumn(patternTrickTuple.subject_); + added.push_back(makeSubtreePlan( + _qec, parent._qet, subjectColumn, predicateVariable, countVariable)); } return added; } @@ -1864,7 +1860,8 @@ auto QueryPlanner::createJoinWithHasPredicateScan( // Note that this is a new operation. auto object = static_cast( hasPredicateScanTree->getRootOperation().get()) - ->getObject(); + ->getObject() + .getVariable(); auto plan = makeSubtreePlan( qec, std::move(otherTree), otherTreeJoinColumn, std::move(object)); mergeSubtreePlanIds(plan, a, b); diff --git a/src/global/Constants.h b/src/global/Constants.h index 7584c3a028..8e643c7f97 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -194,6 +194,12 @@ static constexpr int DEFAULT_MAX_NUM_COLUMNS_STATIC_ID_TABLE = 5; // `CancellationHandle::throwIfCancelled` is called regularly. constexpr std::chrono::milliseconds DESIRED_CANCELLATION_CHECK_INTERVAL{50}; +// In the PSO and POS permutations the patterns of the subject and object are +// stored at the following indices. Note that the col0 (the P) is not part of +// the result, so the column order for PSO is S O PatternS PatternO. 
+constexpr size_t ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN = 2; +constexpr size_t ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN = 3; + inline auto& RuntimeParameters() { using ad_utility::detail::parameterShortNames::Bool; using ad_utility::detail::parameterShortNames::Double; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 4aabb49e03..419320737b 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -87,16 +87,6 @@ std::pair Index::prefix_range(const std::string& prefix) const { return pimpl_->prefix_range(prefix); } -// ____________________________________________________________________________ -const vector& Index::getHasPattern() const { - return pimpl_->getHasPattern(); -} - -// ____________________________________________________________________________ -const CompactVectorOfStrings& Index::getHasPredicate() const { - return pimpl_->getHasPredicate(); -} - // ____________________________________________________________________________ const CompactVectorOfStrings& Index::getPatterns() const { return pimpl_->getPatterns(); diff --git a/src/index/Index.h b/src/index/Index.h index 1adfcc6c57..bfe56f0cd7 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -131,8 +131,6 @@ class Index { [[nodiscard]] std::pair prefix_range(const std::string& prefix) const; - [[nodiscard]] const vector& getHasPattern() const; - [[nodiscard]] const CompactVectorOfStrings& getHasPredicate() const; [[nodiscard]] const CompactVectorOfStrings& getPatterns() const; /** * @return The multiplicity of the entites column (0) of the full has-relation diff --git a/src/index/IndexFormatVersion.h b/src/index/IndexFormatVersion.h index b0dd2c7d7f..6a4afa99c1 100644 --- a/src/index/IndexFormatVersion.h +++ b/src/index/IndexFormatVersion.h @@ -36,6 +36,6 @@ struct IndexFormatVersion { // The actual index version. Change it once the binary format of the index // changes. 
inline const IndexFormatVersion& indexFormatVersion{ - 1031, DateOrLargeYear{Date{2023, 7, 20}}}; + 1223, DateOrLargeYear{Date{2024, 1, 18}}}; } // namespace qlever diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index d73f0a1a36..6c37326691 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -27,6 +27,7 @@ #include "util/HashMap.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" #include "util/Serializer/FileSerializer.h" +#include "util/ThreadSafeQueue.h" #include "util/TupleHelpers.h" #include "util/TypeTraits.h" @@ -188,7 +189,7 @@ auto fixBlockAfterPatternJoin(auto block) { // ____________________________________________________________________________ std::unique_ptr> IndexImpl::buildOspWithPatterns( - PatternCreatorNew::TripleSorter sortersFromPatternCreator, + PatternCreator::TripleSorter sortersFromPatternCreator, auto isQleverInternalId) { auto&& [hasPatternPredicateSortedByPSO, secondSorter] = sortersFromPatternCreator; @@ -265,8 +266,8 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( // Add the `ql:has-pattern` predicate to the sorter such that it will become // part of the PSO and POS permutation. LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() - << " additional triples to the POS and PSO permutation for the " - "`ql:has-pattern` predicate ..." + << "triples to the POS and PSO permutation for " + "`ql:has-pattern` ..." << std::endl; auto noPattern = Id::makeFromInt(NO_PATTERN); static_assert(NumColumnsIndexBuilding == 3); @@ -788,12 +789,14 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { << std::endl; } + // We have to load the patterns first to figure out if the patterns were built + // at all. 
if (usePatterns_) { try { PatternCreator::readPatternsFromFile( onDiskBase_ + ".index.patterns", avgNumDistinctSubjectsPerPredicate_, avgNumDistinctPredicatesPerSubject_, - numDistinctSubjectPredicatePairs_, patterns_, hasPattern_); + numDistinctSubjectPredicatePairs_, patterns_); } catch (const std::exception& e) { LOG(WARN) << "Could not load the patterns. The internal predicate " "`ql:has-predicate` is therefore not available (and certain " @@ -815,18 +818,6 @@ void IndexImpl::throwExceptionIfNoPatterns() const { } } -// _____________________________________________________________________________ -const vector& IndexImpl::getHasPattern() const { - throwExceptionIfNoPatterns(); - return hasPattern_; -} - -// _____________________________________________________________________________ -const CompactVectorOfStrings& IndexImpl::getHasPredicate() const { - throwExceptionIfNoPatterns(); - return hasPredicate_; -} - // _____________________________________________________________________________ const CompactVectorOfStrings& IndexImpl::getPatterns() const { throwExceptionIfNoPatterns(); @@ -1566,30 +1557,26 @@ void IndexImpl::createPSOAndPOS(size_t numColumns, auto& isInternalId, // _____________________________________________________________________________ template requires(sizeof...(NextSorter) <= 1) -std::optional IndexImpl::createSPOAndSOP( +std::optional IndexImpl::createSPOAndSOP( size_t numColumns, auto& isInternalId, BlocksOfTriples sortedTriples, NextSorter&&... nextSorter) { size_t numSubjectsNormal = 0; auto numSubjectCounter = makeNumDistinctIdsCounter<0>(numSubjectsNormal, isInternalId); - std::optional result; + std::optional result; if (usePatterns_) { // We will return the next sorter. AD_CORRECTNESS_CHECK(sizeof...(nextSorter) == 0); // For now (especially for testing) We build the new pattern format as well // as the old one to see that they match. 
- PatternCreatorNew patternCreator{ - onDiskBase_ + ".index.patterns.new", + PatternCreator patternCreator{ + onDiskBase_ + ".index.patterns", memoryLimitIndexBuilding() / NUM_EXTERNAL_SORTERS_AT_SAME_TIME}; - PatternCreator patternCreatorOld{onDiskBase_ + ".index.patterns"}; - auto pushTripleToPatterns = [&patternCreator, &patternCreatorOld, + auto pushTripleToPatterns = [&patternCreator, &isInternalId](const auto& triple) { bool ignoreForPatterns = std::ranges::any_of(triple, isInternalId); auto tripleArr = std::array{triple[0], triple[1], triple[2]}; patternCreator.processTriple(tripleArr, ignoreForPatterns); - if (!ignoreForPatterns) { - patternCreatorOld.processTriple(tripleArr); - } }; createPermutationPair(numColumns, AD_FWD(sortedTriples), spo_, sop_, nextSorter.makePushCallback()..., diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 7bea6fe050..49d133d548 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -162,15 +162,6 @@ class IndexImpl { * @brief Maps pattern ids to sets of predicate ids. */ CompactVectorOfStrings patterns_; - /** - * @brief Maps entity ids to pattern ids. - */ - std::vector hasPattern_; - /** - * @brief Maps entity ids to sets of predicate ids - */ - CompactVectorOfStrings hasPredicate_; - ad_utility::AllocatorWithLimit allocator_; // TODO: make those private and allow only const access @@ -279,8 +270,6 @@ class IndexImpl { // ___________________________________________________________________________ std::pair prefix_range(const std::string& prefix) const; - const vector& getHasPattern() const; - const CompactVectorOfStrings& getHasPredicate() const; const CompactVectorOfStrings& getPatterns() const; /** * @return The multiplicity of the Entites column (0) of the full has-relation @@ -783,7 +772,7 @@ class IndexImpl { // metadata. Also builds the patterns if specified. 
template requires(sizeof...(NextSorter) <= 1) - std::optional createSPOAndSOP( + std::optional createSPOAndSOP( size_t numColumns, auto& isInternalId, BlocksOfTriples sortedTriples, NextSorter&&... nextSorter); // Create the OSP and OPS permutations. Additionally, count the number of @@ -826,7 +815,7 @@ class IndexImpl { // of only two permutations (where we have to build the Pxx permutations). In // all other cases the Sxx permutations are built first because we need the // patterns. - std::optional createFirstPermutationPair( + std::optional createFirstPermutationPair( auto&&... args) { static_assert(std::is_same_v); static_assert(std::is_same_v); @@ -855,6 +844,6 @@ class IndexImpl { // these five columns sorted by PSO, to be used as an input for building the // PSO and POS permutations. std::unique_ptr> buildOspWithPatterns( - PatternCreatorNew::TripleSorter sortersFromPatternCreator, + PatternCreator::TripleSorter sortersFromPatternCreator, auto isQLeverInternalId); }; diff --git a/src/index/PatternCreator.cpp b/src/index/PatternCreator.cpp index f14ea34628..c5f1991cb1 100644 --- a/src/index/PatternCreator.cpp +++ b/src/index/PatternCreator.cpp @@ -9,8 +9,8 @@ static const Id hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); // _________________________________________________________________________ -void PatternCreatorNew::processTriple(std::array triple, - bool ignoreForPatterns) { +void PatternCreator::processTriple(std::array triple, + bool ignoreForPatterns) { if (ignoreForPatterns) { tripleBuffer_.emplace_back(triple, ignoreForPatterns); return; @@ -32,8 +32,8 @@ void PatternCreatorNew::processTriple(std::array triple, } // ________________________________________________________________________________ -void PatternCreatorNew::finishSubject(VocabIndex subjectIndex, - const Pattern& pattern) { +void PatternCreator::finishSubject(VocabIndex subjectIndex, + const Pattern& pattern) { numDistinctSubjects_++; numDistinctSubjectPredicatePairs_ += 
pattern.size(); PatternID patternId; @@ -73,7 +73,7 @@ void PatternCreatorNew::finishSubject(VocabIndex subjectIndex, } // ____________________________________________________________________________ -void PatternCreatorNew::finish() { +void PatternCreator::finish() { if (isFinished_) { return; } @@ -109,7 +109,7 @@ void PatternCreatorNew::finish() { } // ____________________________________________________________________________ -void PatternCreatorNew::readPatternsFromFile( +void PatternCreator::readPatternsFromFile( const std::string& filename, double& avgNumSubjectsPerPredicate, double& avgNumPredicatesPerSubject, uint64_t& numDistinctSubjectPredicatePairs, @@ -132,7 +132,7 @@ void PatternCreatorNew::readPatternsFromFile( } // ____________________________________________________________________________ -void PatternCreatorNew::printStatistics( +void PatternCreator::printStatistics( PatternStatistics patternStatistics) const { LOG(INFO) << "Number of distinct patterns: " << patternToIdAndCount_.size() << std::endl; @@ -149,147 +149,3 @@ void PatternCreatorNew::printStatistics( << patternStatistics.avgNumDistinctSubjectsPerPredicate_ << std::endl; } - -// All the legacy code of the old pattern stuff. -// _________________________________________________________________________ -void PatternCreator::processTriple(std::array triple) { - if (!_currentSubjectIndex.has_value()) { - // This is the first triple - _currentSubjectIndex = triple[0].getVocabIndex(); - } else if (triple[0].getVocabIndex() != _currentSubjectIndex) { - // New subject. - finishSubject(_currentSubjectIndex.value(), _currentPattern); - _currentSubjectIndex = triple[0].getVocabIndex(); - _currentPattern.clear(); - } - // Don't list predicates twice in the same pattern. 
- if (_currentPattern.empty() || _currentPattern.back() != triple[1]) { - _currentPattern.push_back(triple[1]); - } -} - -// ________________________________________________________________________________ -void PatternCreator::finishSubject(VocabIndex subjectIndex, - const Pattern& pattern) { - _numDistinctSubjects++; - _numDistinctSubjectPredicatePairs += pattern.size(); - PatternID patternId; - auto it = _patternToIdAndCount.find(pattern); - if (it == _patternToIdAndCount.end()) { - // This is a new pattern, assign a new pattern ID and a count of 1. - patternId = static_cast(_patternToIdAndCount.size()); - _patternToIdAndCount[pattern] = PatternIdAndCount{patternId, 1UL}; - - // Count the total number of distinct predicates that appear in the - // pattern and have not been counted before. - for (auto predicate : pattern) { - _distinctPredicates.insert(predicate); - } - } else { - // We have already seen the same pattern for a previous subject ID, reuse - // the ID and increase the count. - patternId = it->second._patternId; - it->second._count++; - } - - // The mapping from subjects to patterns is a vector of pattern IDs. We have - // to assign the ID NO_PATTERN to all the possible subjects that have no - // triple. - while (_nextUnassignedSubjectIndex < subjectIndex) { - _subjectToPatternSerializer.push(NO_PATTERN); - _nextUnassignedSubjectIndex = _nextUnassignedSubjectIndex.incremented(); - } - - // Write the subjectIndex-pattern mapping for this subjectIndex. - _subjectToPatternSerializer.push(patternId); - _nextUnassignedSubjectIndex = _nextUnassignedSubjectIndex.incremented(); -} - -// ____________________________________________________________________________ -void PatternCreator::finish() { - if (_isFinished) { - return; - } - _isFinished = true; - - // Write the pattern of the last subject. 
- if (_currentSubjectIndex.has_value()) { - finishSubject(_currentSubjectIndex.value(), _currentPattern); - } - - // The mapping from subjects to patterns is already written to disk at this - // point. - _subjectToPatternSerializer.finish(); - - // Store all data in the file - ad_utility::serialization::FileWriteSerializer patternSerializer{ - std::move(_subjectToPatternSerializer).serializer()}; - - PatternStatistics patternStatistics(_numDistinctSubjectPredicatePairs, - _numDistinctSubjects, - _distinctPredicates.size()); - patternSerializer << patternStatistics; - - // Store the actual patterns ordered by their pattern ID. They are currently - // stored in a hash map, so we first have to sort them. - std::vector> orderedPatterns; - orderedPatterns.insert(orderedPatterns.end(), _patternToIdAndCount.begin(), - _patternToIdAndCount.end()); - std::sort(orderedPatterns.begin(), orderedPatterns.end(), - [](const auto& a, const auto& b) { - return a.second._patternId < b.second._patternId; - }); - CompactVectorOfStrings::Writer patternWriter{ - std::move(patternSerializer).file()}; - for (const auto& p : orderedPatterns) { - patternWriter.push(p.first.data(), p.first.size()); - } - patternWriter.finish(); - - // Print some statistics for the log of the index builder. - printStatistics(patternStatistics); -} - -// ____________________________________________________________________________ -void PatternCreator::readPatternsFromFile( - const std::string& filename, double& avgNumSubjectsPerPredicate, - double& avgNumPredicatesPerSubject, - uint64_t& numDistinctSubjectPredicatePairs, - CompactVectorOfStrings& patterns, - std::vector& subjectToPattern) { - // Read the pattern info from the patterns file. - LOG(INFO) << "Reading patterns from file " << filename << " ..." << std::endl; - - // Read the subjectToPatternMap. - ad_utility::serialization::FileReadSerializer patternReader(filename); - - // Read the statistics and the patterns. 
- patternReader >> subjectToPattern; - PatternStatistics statistics; - patternReader >> statistics; - patternReader >> patterns; - - numDistinctSubjectPredicatePairs = - statistics.numDistinctSubjectPredicatePairs_; - avgNumSubjectsPerPredicate = statistics.avgNumDistinctSubjectsPerPredicate_; - avgNumPredicatesPerSubject = statistics.avgNumDistinctPredicatesPerSubject_; -} - -// ____________________________________________________________________________ -void PatternCreator::printStatistics( - PatternStatistics patternStatistics) const { - LOG(INFO) << "Number of distinct patterns: " << _patternToIdAndCount.size() - << std::endl; - LOG(INFO) << "Number of subjects with pattern: " << _numDistinctSubjects - << " [all]" << std::endl; - LOG(INFO) << "Total number of distinct subject-predicate pairs: " - << _numDistinctSubjectPredicatePairs << std::endl; - LOG(INFO) << "Average number of predicates per subject: " << std::fixed - << std::setprecision(1) - << patternStatistics.avgNumDistinctPredicatesPerSubject_ - << std::endl; - LOG(INFO) << "Average number of subjects per predicate: " << std::fixed - << std::setprecision(0) - << patternStatistics.avgNumDistinctSubjectsPerPredicate_ - << std::endl; -} diff --git a/src/index/PatternCreator.h b/src/index/PatternCreator.h index fafbe9b0f4..9fb224b799 100644 --- a/src/index/PatternCreator.h +++ b/src/index/PatternCreator.h @@ -68,7 +68,7 @@ struct PatternStatistics { /// mapping from subjects to predicates (has-predicate) is not written to disk, /// but stored in a STXXL sorter which then has to be used to build an index for /// these predicates. -class PatternCreatorNew { +class PatternCreator { public: using PSOSorter = ad_utility::CompressedExternalIdTableSorter; using OSPSorter4Cols = @@ -125,8 +125,8 @@ class PatternCreatorNew { public: // The patterns will be written to files starting with `basename`. 
- explicit PatternCreatorNew(const string& basename, - ad_utility::MemorySize memoryLimit) + explicit PatternCreator(const string& basename, + ad_utility::MemorySize memoryLimit) : filename_{basename}, patternSerializer_{{basename}}, tripleBuffer_(100'000, basename + ".tripleBufferForPatterns.dat"), @@ -147,20 +147,20 @@ class PatternCreatorNew { // Write the patterns to disk after all triples have been pushed. Calls to // `processTriple` after calling `finish` lead to undefined behavior. Note - // that the destructor also calls `finish` to give the `PatternCreatorNew` + // that the destructor also calls `finish` to give the `PatternCreator` // proper RAII semantics. void finish(); // Destructor implicitly calls `finish`. - ~PatternCreatorNew() { + ~PatternCreator() { ad_utility::terminateIfThrows([this]() { finish(); }, "Finishing the underlying file of a " - "`PatternCreatorNew` during destruction."); + "`PatternCreator` during destruction."); } // Read the patterns from the files with the given `basename`. The patterns // must have been written to files with this `basename` using - // `PatternCreatorNew`. The patterns and all their statistics will be written + // `PatternCreator`. The patterns and all their statistics will be written // to the various arguments. static void readPatternsFromFile(const std::string& filename, double& avgNumSubjectsPerPredicate, @@ -184,87 +184,4 @@ class PatternCreatorNew { } }; -// The old version of the pattern creator. -class PatternCreator { - private: - // The file to which the patterns will be written. - std::string _filename; - - // Store the Id of a pattern, and the number of distinct subjects it occurs - // with. 
- struct PatternIdAndCount { - PatternID _patternId = 0; - uint64_t _count = 0; - }; - using PatternToIdAndCount = ad_utility::HashMap; - PatternToIdAndCount _patternToIdAndCount; - - // Between the calls to `processTriple` we have to remember the current - // subject (the subject of the last triple for which `processTriple` was - // called). - std::optional _currentSubjectIndex; - // The pattern of `currentSubjectIndex_`. This might still be incomplete, - // because more triples with the same subject might be pushed. - Pattern _currentPattern; - - // The lowest subject Id for which we have not yet finished and written the - // pattern. - VocabIndex _nextUnassignedSubjectIndex = VocabIndex::make(0); - - // Directly serialize the mapping from subjects to patterns to disk. - ad_utility::serialization::VectorIncrementalSerializer< - PatternID, ad_utility::serialization::FileWriteSerializer> - _subjectToPatternSerializer; - - // The predicates which have already occured in one of the patterns. Needed to - // count the number of distinct predicates. - ad_utility::HashSet _distinctPredicates; - - // The number of distinct subjects and distinct subject-predicate pairs. - uint64_t _numDistinctSubjects = 0; - uint64_t _numDistinctSubjectPredicatePairs = 0; - - // True if `finish()` was already called. - bool _isFinished = false; - - public: - // The patterns will be written to `filename` as well as to other filenames - // which have `filename` as a prefix. - explicit PatternCreator(const string& filename) - : _filename{filename}, _subjectToPatternSerializer{{filename}} { - LOG(DEBUG) << "Computing predicate patterns ..." << std::endl; - } - - // This function has to be called for all the triples in the SPO permutation - // \param triple Must be >= all previously pushed triples wrt the SPO - // permutation. - void processTriple(std::array triple); - - // Write the patterns to disk after all triples have been pushed. 
Calls to - // `processTriple` after calling `finish` lead to undefined behavior. Note - // that the constructor also calls `finish` to give the `PatternCreator` - // proper RAII semantics. - void finish(); - - // Destructor implicitly calls `finish` - ~PatternCreator() { - ad_utility::terminateIfThrows([this]() { finish(); }, - "Finishing the underlying file of a " - "`PatternCreator` during destruction."); - } - - // Read the patterns from `filename`. The patterns must have been written to - // this file using a `PatternCreator`. The patterns and all their statistics - // will be written to the various arguments. - static void readPatternsFromFile(const std::string& filename, - double& avgNumSubjectsPerPredicate, - double& avgNumPredicatesPerSubject, - uint64_t& numDistinctSubjectPredicatePairs, - CompactVectorOfStrings& patterns, - std::vector& subjectToPattern); - - private: - void finishSubject(VocabIndex subjectIndex, const Pattern& pattern); - void printStatistics(PatternStatistics patternStatistics) const; -}; #endif // QLEVER_PATTERNCREATOR_H diff --git a/src/parser/PropertyPath.cpp b/src/parser/PropertyPath.cpp index 0d35073a83..cdd725e0ee 100644 --- a/src/parser/PropertyPath.cpp +++ b/src/parser/PropertyPath.cpp @@ -119,10 +119,13 @@ void PropertyPath::computeCanBeNull() { // _____________________________________________________________________________ const std::string& PropertyPath::getIri() const { - AD_CONTRACT_CHECK(_operation == Operation::IRI); + AD_CONTRACT_CHECK(isIri()); return _iri; } +// _____________________________________________________________________________ +bool PropertyPath::isIri() const { return _operation == Operation::IRI; } + // _____________________________________________________________________________ std::ostream& operator<<(std::ostream& out, const PropertyPath& p) { p.writeToStream(out); diff --git a/src/parser/PropertyPath.h b/src/parser/PropertyPath.h index f88d3ead0a..afa92df228 100644 --- a/src/parser/PropertyPath.h 
+++ b/src/parser/PropertyPath.h @@ -105,6 +105,7 @@ class PropertyPath { // ASSERT that this property path consists of a single IRI and return that // IRI. [[nodiscard]] const std::string& getIri() const; + bool isIri() const; Operation _operation; diff --git a/src/parser/TripleComponent.cpp b/src/parser/TripleComponent.cpp index 31c87a449a..9309c2bc18 100644 --- a/src/parser/TripleComponent.cpp +++ b/src/parser/TripleComponent.cpp @@ -80,7 +80,9 @@ std::optional TripleComponent::toValueIdIfNotString() const { // ____________________________________________________________________________ std::string TripleComponent::toRdfLiteral() const { - if (isString()) { + if (isVariable()) { + return getVariable().name(); + } else if (isString()) { return getString(); } else if (isLiteral()) { return getLiteral().rawContent(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5bd493562d..22e4ce57ed 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -225,9 +225,6 @@ addLinkAndDiscoverTest(VocabularyTest index) addLinkAndDiscoverTest(IteratorTest) -# Here we also seem to have race conditions on the tests -addLinkAndDiscoverTestSerial(PatternCreatorTest index) - # Stxxl currently always uses a file ./-stxxl.disk for all indices, which # makes it impossible to run the test cases for the Index class in parallel. # TODO fix this diff --git a/test/CheckUsePatternTrickTest.cpp b/test/CheckUsePatternTrickTest.cpp index be6f927a97..70a52e4382 100644 --- a/test/CheckUsePatternTrickTest.cpp +++ b/test/CheckUsePatternTrickTest.cpp @@ -258,27 +258,63 @@ TEST(CheckUsePatternTrick, checkUsePatternTrick) { } TEST(CheckUsePatternTrick, tripleIsCorrectlyRemoved) { - auto pq = SparqlParser::parseQuery( - "SELECT ?p WHERE {?x ql:has-predicate ?p} GROUP BY ?p"); - auto patternTrickTuple = checkUsePatternTrick::checkUsePatternTrick(&pq); - ASSERT_TRUE(patternTrickTuple.has_value()); - // The pattern trick triple has been removed from the query. 
- const auto& triples = std::get( - pq._rootGraphPattern._graphPatterns.at(0)) - ._triples; - ASSERT_TRUE(triples.empty()); + using namespace ::testing; + { + auto pq = SparqlParser::parseQuery( + "SELECT ?p WHERE {?x ql:has-predicate ?p} GROUP BY ?p"); + auto patternTrickTuple = checkUsePatternTrick::checkUsePatternTrick(&pq); + ASSERT_TRUE(patternTrickTuple.has_value()); + // The triple `?x ql:has-predicate ?p` has been replaced by + // `?x ql:has-pattern ?p`. + const auto& triples = std::get( + pq._rootGraphPattern._graphPatterns.at(0)) + ._triples; + ASSERT_EQ(triples.size(), 1u); + const auto& triple = triples[0]; + EXPECT_EQ(triple._s.getVariable().name(), "?x"); + EXPECT_EQ(triple._p.asString(), HAS_PATTERN_PREDICATE); + EXPECT_EQ(triple._o.getVariable().name(), "?p"); + } + + { + auto pq = SparqlParser::parseQuery( + "SELECT ?p WHERE {?x ql:has-predicate ?p . ?x ?y } GROUP BY ?p"); + auto patternTrickTuple = checkUsePatternTrick::checkUsePatternTrick(&pq); + ASSERT_TRUE(patternTrickTuple.has_value()); + // The triple `?x ql:has-predicate ?p` has been removed from the query, but + // an additional scan column for the pattern of the subject has been added + // to the `?x ?y` triple. + const auto& triples = std::get( + pq._rootGraphPattern._graphPatterns.at(0)) + ._triples; + ASSERT_EQ(triples.size(), 1u); + const auto& triple = triples[0]; + EXPECT_EQ(triple._s.getVariable().name(), "?x"); + EXPECT_EQ(triple._p.asString(), ""); + EXPECT_EQ(triple._o.getVariable().name(), "?y"); + EXPECT_THAT(triple._additionalScanColumns, + ElementsAre(std::pair{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + Variable{"?p"}})); + } - pq = SparqlParser::parseQuery( - "SELECT ?p WHERE {?x ql:has-predicate ?p . 
?x ?y } GROUP BY ?p"); - patternTrickTuple = checkUsePatternTrick::checkUsePatternTrick(&pq); - ASSERT_TRUE(patternTrickTuple.has_value()); - // The pattern trick triple has been removed from the query., - const auto& triples2 = std::get( - pq._rootGraphPattern._graphPatterns.at(0)) - ._triples; - ASSERT_EQ(triples2.size(), 1u); - const auto& triple = triples2[0]; - ASSERT_EQ(triple._s.getVariable().name(), "?x"); - ASSERT_EQ(triple._p.asString(), ""); - ASSERT_EQ(triple._o.getVariable().name(), "?y"); + { + auto pq = SparqlParser::parseQuery( + "SELECT ?p WHERE {?x ql:has-predicate ?p . ?y ?x } GROUP BY ?p"); + auto patternTrickTuple = checkUsePatternTrick::checkUsePatternTrick(&pq); + ASSERT_TRUE(patternTrickTuple.has_value()); + // The triple `?x ql:has-predicate ?p` has been removed from the query, but + // an additional scan column for the pattern of the object has been added to + // the `?y ?x` triple. + const auto& triples = std::get( + pq._rootGraphPattern._graphPatterns.at(0)) + ._triples; + ASSERT_EQ(triples.size(), 1u); + const auto& triple = triples[0]; + EXPECT_EQ(triple._s.getVariable().name(), "?y"); + EXPECT_EQ(triple._p.asString(), ""); + EXPECT_EQ(triple._o.getVariable().name(), "?x"); + EXPECT_THAT(triple._additionalScanColumns, + ElementsAre(std::pair{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN, + Variable{"?p"}})); + } } diff --git a/test/HasPredicateScanTest.cpp b/test/HasPredicateScanTest.cpp index bbd93d0d6d..7a3188cf3f 100644 --- a/test/HasPredicateScanTest.cpp +++ b/test/HasPredicateScanTest.cpp @@ -2,362 +2,188 @@ // Chair of Algorithms and Data Structures. 
// Author: Florian Kramer (florian.kramer@mail.uni-freiburg.de) +#include #include #include #include "./IndexTestHelpers.h" -#include "./util/AllocatorTestHelpers.h" +#include "./util/IdTableHelpers.h" #include "./util/IdTestHelpers.h" #include "engine/CallFixedSize.h" #include "engine/CountAvailablePredicates.h" #include "engine/HasPredicateScan.h" -#include "engine/SortPerformanceEstimator.h" #include "engine/ValuesForTesting.h" -using ad_utility::testing::makeAllocator; namespace { -auto V = ad_utility::testing::VocabId; +using ad_utility::testing::makeAllocator; auto Int = ad_utility::testing::IntId; -// used to test HasRelationScan with a subtree -auto makeDummyOperation() { - IdTable result{makeAllocator()}; - result.setNumColumns(2); - for (size_t i = 0; i < 10; i++) { - result.push_back({V(10 - i), V(2 * i)}); +// A text fixture that is used in the following. It consists of a small index +// and variables for all the IDs that appear in the index. +class HasPredicateScanTest : public ::testing::Test { + public: + using V = Variable; + std::string kg = + "

. . .

. " + ". ."; + // Mapping from subjects to distinct predicates (makes reading the test + // results easier). x -> p p2 y -> p p3 z -> p3 + QueryExecutionContext* qec = ad_utility::testing::getQec(kg); + std::function getId = + ad_utility::testing::makeGetId(qec->getIndex()); + Id x = getId(""); + Id y = getId(""); + Id z = getId(""); + Id p = getId("

"); + Id p2 = getId(""); + Id p3 = getId(""); + + // Expect that the result of the `operation` matches the `expectedElements`. + void runTest(Operation& operation, const VectorTable& expectedElements) { + auto expected = makeIdTableFromVector(expectedElements); + EXPECT_THAT(operation.getResult()->idTable(), + ::testing::ElementsAreArray(expected)); } - std::vector> vars{Variable{"?a"}, Variable{"?b"}}; - return ad_utility::makeExecutionTree( - ad_utility::testing::getQec(), std::move(result), std::move(vars)); -} -} // namespace - -TEST(HasPredicateScan, freeS) { - // Used to store the result. - IdTable idTable{makeAllocator()}; - idTable.setNumColumns(1); - // Maps entities to their patterns. If an entity id is higher than the lists - // length the hasRelation relation is used instead. - vector hasPattern = {0, NO_PATTERN, NO_PATTERN, 1, 0}; - // The has relation relation, which is used when an entity does not have a - // pattern - vector> hasRelationSrc = {{}, {V(0), V(3)}, {V(0)}, - {}, {}, {V(0), V(3)}, - {V(3), V(4)}, {V(2), V(4)}, {V(3)}}; - // Maps pattern ids to patterns - vector> patternsSrc = {{V(0), V(2), V(3)}, - {V(1), V(3), V(4), V(2), V(0)}}; - - // These are used to store the relations and patterns in contiguous blocks - // of memory. - CompactVectorOfStrings hasRelation(hasRelationSrc); - CompactVectorOfStrings patterns(patternsSrc); - // Find all entities that are in a triple with predicate 3 - HasPredicateScan::computeFreeS(&idTable, V(3), hasPattern, hasRelation, - patterns); - IdTable& result = idTable; - - // the result set does not guarantee any sorting so we have to sort manually - std::sort(result.begin(), result.end(), - [](const auto& a, const auto& b) { return a[0] < b[0]; }); + // Expect that the result of the `operation` matches the `expectedElements`, + // but without taking the order into account. 
+ void runTestUnordered(Operation& op, const VectorTable& expectedElements) { + auto expected = makeIdTableFromVector(expectedElements); + EXPECT_THAT(op.getResult()->idTable(), + ::testing::UnorderedElementsAreArray(expected)); + } +}; +} // namespace - // three entties with a pattern and four entities without one are in the - // relation - ASSERT_EQ(7u, result.size()); - ASSERT_EQ(V(0u), result[0][0]); - ASSERT_EQ(V(1u), result[1][0]); - ASSERT_EQ(V(3u), result[2][0]); - ASSERT_EQ(V(4u), result[3][0]); - ASSERT_EQ(V(5u), result[4][0]); - ASSERT_EQ(V(6u), result[5][0]); - ASSERT_EQ(V(8u), result[6][0]); +// TODO In addition to the manual setups of the operations, we could +// also test the query setup in an E2E session by going through the +// queryPlanner. +// _____________________________________________________________ +TEST_F(HasPredicateScanTest, freeS) { + // ?x ql:has-predicate

, expected result : and + auto scan = HasPredicateScan{ + qec, SparqlTriple{Variable{"?x"}, HAS_PREDICATE_PREDICATE, "

"}}; + runTest(scan, {{x}, {y}}); } -TEST(HasPredicateScan, freeO) { - // Used to store the result. - IdTable result{makeAllocator()}; - result.setNumColumns(1); - // Maps entities to their patterns. If an entity id is higher than the lists - // length the hasRelation relation is used instead. - vector hasPattern = {0, NO_PATTERN, NO_PATTERN, 1, 0}; - // The has relation relation, which is used when an entity does not have a - // pattern - vector> hasRelationSrc = {{}, {V(0), V(3)}, {V(0)}, - {}, {}, {V(0), V(3)}, - {V(3), V(4)}, {V(2), V(4)}, {V(3)}}; - // Maps pattern ids to patterns - vector> patternsSrc = {{V(0), V(2), V(3)}, - {V(1), V(3), V(4), V(2), V(0)}}; - - // These are used to store the relations and patterns in contiguous blocks - // of memory. - CompactVectorOfStrings hasRelation(hasRelationSrc); - CompactVectorOfStrings patterns(patternsSrc); - - // Find all predicates for entity 3 (pattern 1) - HasPredicateScan::computeFreeO(&result, V(3), hasPattern, hasRelation, - patterns); - - ASSERT_EQ(5u, result.size()); - ASSERT_EQ(V(1u), result[0][0]); - ASSERT_EQ(V(3u), result[1][0]); - ASSERT_EQ(V(4u), result[2][0]); - ASSERT_EQ(V(2u), result[3][0]); - ASSERT_EQ(V(0u), result[4][0]); - - result.clear(); - - // Find all predicates for entity 6 (has-relation entry 6) - HasPredicateScan::computeFreeO(&result, V(6), hasPattern, hasRelation, - patterns); - - ASSERT_EQ(2u, result.size()); - ASSERT_EQ(V(3u), result[0][0]); - ASSERT_EQ(V(4u), result[1][0]); +// _____________________________________________________________ +TEST_F(HasPredicateScanTest, freeO) { + // ql:has-predicate ?p, expected result :

and + auto scan = HasPredicateScan{ + qec, SparqlTriple{"", HAS_PREDICATE_PREDICATE, Variable{"?p"}}}; + runTest(scan, {{p}, {p2}}); } -TEST(HasPredicateScan, fullScan) { - // Used to store the result. - IdTable result{makeAllocator()}; - result.setNumColumns(2); - // Maps entities to their patterns. If an entity id is higher than the lists - // length the hasRelation relation is used instead. - vector hasPattern = {0, NO_PATTERN, NO_PATTERN, 1, 0}; - // The has relation relation, which is used when an entity does not have a - // pattern - vector> hasRelationSrc = {{}, {V(0), V(3)}, {V(0)}, - {}, {}, {V(0), V(3)}}; - // Maps pattern ids to patterns - vector> patternsSrc = {{V(0), V(2), V(3)}, - {V(1), V(3), V(4), V(2), V(0)}}; - - // These are used to store the relations and patterns in contiguous blocks - // of memory. - CompactVectorOfStrings hasRelation(hasRelationSrc); - CompactVectorOfStrings patterns(patternsSrc); - - // Query for all relations - HasPredicateScan::computeFullScan(&result, hasPattern, hasRelation, patterns, - 16); - - ASSERT_EQ(16u, result.size()); - - // check the entity ids - ASSERT_EQ(V(0u), result[0][0]); - ASSERT_EQ(V(0u), result[1][0]); - ASSERT_EQ(V(0u), result[2][0]); - ASSERT_EQ(V(1u), result[3][0]); - ASSERT_EQ(V(1u), result[4][0]); - ASSERT_EQ(V(2u), result[5][0]); - ASSERT_EQ(V(3u), result[6][0]); - ASSERT_EQ(V(3u), result[7][0]); - ASSERT_EQ(V(3u), result[8][0]); - ASSERT_EQ(V(3u), result[9][0]); - ASSERT_EQ(V(3u), result[10][0]); - ASSERT_EQ(V(4u), result[11][0]); - ASSERT_EQ(V(4u), result[12][0]); - ASSERT_EQ(V(4u), result[13][0]); - ASSERT_EQ(V(5u), result[14][0]); - ASSERT_EQ(V(5u), result[15][0]); - - // check the predicate ids - ASSERT_EQ(V(0u), result[0][1]); - ASSERT_EQ(V(2u), result[1][1]); - ASSERT_EQ(V(3u), result[2][1]); - ASSERT_EQ(V(0u), result[3][1]); - ASSERT_EQ(V(3u), result[4][1]); - ASSERT_EQ(V(0u), result[5][1]); - ASSERT_EQ(V(1u), result[6][1]); - ASSERT_EQ(V(3u), result[7][1]); - ASSERT_EQ(V(4u), 
result[8][1]); - ASSERT_EQ(V(2u), result[9][1]); - ASSERT_EQ(V(0u), result[10][1]); - ASSERT_EQ(V(0u), result[11][1]); - ASSERT_EQ(V(2u), result[12][1]); - ASSERT_EQ(V(3u), result[13][1]); - ASSERT_EQ(V(0u), result[14][1]); - ASSERT_EQ(V(3u), result[15][1]); +// _____________________________________________________________ +TEST_F(HasPredicateScanTest, fullScan) { + // ?x ql:has-predicate ?y, expect the full mapping. + auto scan = HasPredicateScan{ + qec, + SparqlTriple{Variable{"?s"}, HAS_PREDICATE_PREDICATE, Variable{"?p"}}}; + runTest(scan, {{x, p}, {x, p2}, {y, p}, {y, p3}, {z, p3}}); + + // Full scans with the same variable in the subject and object are not + // supported. + auto makeIllegalScan = [this] { + return HasPredicateScan{ + qec, + SparqlTriple{Variable{"?s"}, HAS_PREDICATE_PREDICATE, Variable{"?s"}}}; + }; + AD_EXPECT_THROW_WITH_MESSAGE( + makeIllegalScan(), + ::testing::ContainsRegex( + "same variable for subject and object not supported")); + + // Triples without any variables also aren't supported currently. + auto makeIllegalScan2 = [this] { + return HasPredicateScan{ + qec, SparqlTriple{"", HAS_PREDICATE_PREDICATE, ""}}; + }; + EXPECT_ANY_THROW(makeIllegalScan2()); } -TEST(HasPredicateScan, subtreeS) { - // Used to store the result. - IdTable result{makeAllocator()}; - result.setNumColumns(3); - // Maps entities to their patterns. If an entity id is higher than the lists - // length the hasRelation relation is used instead. - vector hasPattern = {0, NO_PATTERN, NO_PATTERN, 1, 0}; - // The has relation relation, which is used when an entity does not have a - // pattern - vector> hasRelationSrc = {{}, {V(0), V(3)}, {V(0)}, - {}, {}, {V(0), V(3)}, - {V(3), V(4)}, {V(2), V(4)}, {V(3)}}; - // Maps pattern ids to patterns - vector> patternsSrc = {{V(0), V(2), V(3)}, - {V(1), V(3), V(4), V(2), V(0)}}; - - // These are used to store the relations and patterns in contiguous blocks - // of memory. 
- CompactVectorOfStrings hasRelation(hasRelationSrc); - CompactVectorOfStrings patterns(patternsSrc); - - Index index{ad_utility::makeUnlimitedAllocator()}; - QueryResultCache cache{}; - QueryExecutionContext ctx(index, &cache, makeAllocator(), - SortPerformanceEstimator{}); - - // create the subtree operation - std::shared_ptr subtree = makeDummyOperation(); - - std::shared_ptr subresult = subtree->getResult(); - int in_width = 2; - int out_width = 3; - CALL_FIXED_SIZE((std::array{in_width, out_width}), - HasPredicateScan::computeSubqueryS, &result, - subresult->idTable(), 1, hasPattern, hasRelation, patterns); - - // the sum of the count of every second entities relations - ASSERT_EQ(10u, result.size()); - - // check for the first column - - // check for the entity ids - ASSERT_EQ(V(10u), result[0][0]); - ASSERT_EQ(V(10u), result[1][0]); - ASSERT_EQ(V(10u), result[2][0]); - ASSERT_EQ(V(9u), result[3][0]); - ASSERT_EQ(V(8u), result[4][0]); - ASSERT_EQ(V(8u), result[5][0]); - ASSERT_EQ(V(8u), result[6][0]); - ASSERT_EQ(V(7u), result[7][0]); - ASSERT_EQ(V(7u), result[8][0]); - ASSERT_EQ(V(6u), result[9][0]); - - // check for the entity ids - ASSERT_EQ(V(0u), result[0][1]); - ASSERT_EQ(V(0u), result[1][1]); - ASSERT_EQ(V(0u), result[2][1]); - ASSERT_EQ(V(2u), result[3][1]); - ASSERT_EQ(V(4u), result[4][1]); - ASSERT_EQ(V(4u), result[5][1]); - ASSERT_EQ(V(4u), result[6][1]); - ASSERT_EQ(V(6u), result[7][1]); - ASSERT_EQ(V(6u), result[8][1]); - ASSERT_EQ(V(8u), result[9][1]); - - // check for the predicate ids - ASSERT_EQ(V(0u), result[0][2]); - ASSERT_EQ(V(2u), result[1][2]); - ASSERT_EQ(V(3u), result[2][2]); - ASSERT_EQ(V(0u), result[3][2]); - ASSERT_EQ(V(0u), result[4][2]); - ASSERT_EQ(V(2u), result[5][2]); - ASSERT_EQ(V(3u), result[6][2]); - ASSERT_EQ(V(3u), result[7][2]); - ASSERT_EQ(V(4u), result[8][2]); - ASSERT_EQ(V(3u), result[9][2]); +// _____________________________________________________________ +TEST_F(HasPredicateScanTest, subtree) { + // ?x ?y . 
?x ql:has-predicate ?predicate. + // The first triple matches only ` `, so we get the pattern + // for `y` with an additional column that always is `( + qec, Permutation::Enum::OPS, SparqlTriple{V{"?x"}, "?y", ""}); + auto scan = HasPredicateScan{qec, indexScan, 1, V{"?predicate"}}; + runTest(scan, {{p3, y, p}, {p3, y, p3}}); } -TEST(CountAvailablePredicates, patternTrickTest) { - // The input table containing entity ids - IdTable input(1, makeAllocator()); - for (uint64_t i = 0; i < 8; i++) { - input.push_back({V(i)}); - } - // Used to store the result. - IdTable result(2, makeAllocator()); - // Maps entities to their patterns. If an entity id is higher than the lists - // length the hasRelation relation is used instead. - vector hasPattern = {0, NO_PATTERN, NO_PATTERN, 1, 0}; - // The has relation relation, which is used when an entity does not have a - // pattern - vector> hasRelationSrc = {{}, {V(0), V(3)}, {V(0)}, - {}, {}, {V(0), V(3)}, - {V(3), V(4)}, {V(2), V(4)}, {V(3)}}; - // Maps pattern ids to patterns - vector> patternsSrc = {{V(0), V(2), V(3)}, - {V(1), V(3), V(4), V(2), V(0)}}; - - // These are used to store the relations and patterns in contiguous blocks - // of memory. - CompactVectorOfStrings hasRelation(hasRelationSrc); - CompactVectorOfStrings patterns(patternsSrc); - - RuntimeInformation runtimeInfo{}; - try { - CALL_FIXED_SIZE(input.numColumns(), - CountAvailablePredicates::computePatternTrick, input, - &result, hasPattern, hasRelation, patterns, 0, runtimeInfo); - } catch (const std::runtime_error& e) { - // More verbose output in the case of an exception occuring. 
- std::cout << e.what() << std::endl; - ASSERT_TRUE(false); - } - - std::sort( - result.begin(), result.end(), - [](const auto& i1, const auto& i2) -> bool { return i1[0] < i2[0]; }); - ASSERT_EQ(5u, result.size()); - - ASSERT_EQ(V(0u), result(0, 0)); - ASSERT_EQ(Int(6u), result(0, 1)); - - ASSERT_EQ(V(1u), result(1, 0)); - ASSERT_EQ(Int(1u), result(1, 1)); - - ASSERT_EQ(V(2u), result(2, 0)); - ASSERT_EQ(Int(4u), result(2, 1)); - - ASSERT_EQ(V(3u), result(3, 0)); - ASSERT_EQ(Int(6u), result(3, 1)); - - ASSERT_EQ(V(4u), result(4, 0)); - ASSERT_EQ(Int(3u), result(4, 1)); - - // ASSERT_EQ(0u, result[0][0]); - // ASSERT_EQ(5u, result[0][1]); - // - // ASSERT_EQ(1u, result[1][0]); - // ASSERT_EQ(1u, result[1][1]); - // - // ASSERT_EQ(2u, result[2][0]); - // ASSERT_EQ(4u, result[2][1]); - // - // ASSERT_EQ(3u, result[3][0]); - // ASSERT_EQ(5u, result[3][1]); - // - // ASSERT_EQ(4u, result[4][0]); - // ASSERT_EQ(3u, result[4][1]); - - // Test the pattern trick for all entities - result.clear(); - try { - CountAvailablePredicates::computePatternTrickAllEntities( - &result, hasPattern, hasRelation, patterns); - } catch (const std::runtime_error& e) { - // More verbose output in the case of an exception occuring. - std::cout << e.what() << std::endl; - ASSERT_TRUE(false); - } - std::sort( - result.begin(), result.end(), - [](const auto& i1, const auto& i2) -> bool { return i1[0] < i2[0]; }); - - ASSERT_EQ(5u, result.size()); - - ASSERT_EQ(V(0u), result[0][0]); - ASSERT_EQ(Int(6u), result[0][1]); - - ASSERT_EQ(V(1u), result[1][0]); - ASSERT_EQ(Int(1u), result[1][1]); +// ____________________________________________________________ +TEST_F(HasPredicateScanTest, patternTrickWithSubtree) { + /* Manual setup of the operations for the following pattern trick + * query: + * SELECT ?predicate COUNT(DISTINCT ?x) WHERE { + * ?x ?y. 
+ * ?x ?predicate ?o + * } GROUP BY ?predicate + */ + auto triple = SparqlTriple{V{"?x"}, "", V{"?y"}}; + triple._additionalScanColumns.emplace_back( + ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, V{"?predicate"}); + auto indexScan = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, triple); + auto patternTrick = + CountAvailablePredicates(qec, indexScan, 1, V{"?predicate"}, V{"?count"}); + + runTestUnordered(patternTrick, {{p3, Int(2)}, {p, Int(1)}}); +} - ASSERT_EQ(V(2u), result[2][0]); - ASSERT_EQ(Int(4u), result[2][1]); +// ____________________________________________________________ +TEST_F(HasPredicateScanTest, patternTrickWithSubtreeTwoFixedElements) { + /* Manual setup of the operations for the following pattern trick + * query (not so different, but increases the test coverage): + * SELECT ?predicate COUNT(DISTINCT ?x) WHERE { + * ?x . + * ?x ?predicate ?o + * } GROUP BY ?predicate + */ + auto triple = SparqlTriple{V{"?x"}, "", ""}; + triple._additionalScanColumns.emplace_back( + ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, Variable{"?predicate"}); + auto indexScan = ad_utility::makeExecutionTree( + qec, Permutation::Enum::POS, triple); + auto patternTrick = + CountAvailablePredicates(qec, indexScan, 0, V{"?predicate"}, V{"?count"}); + + runTestUnordered(patternTrick, {{p3, Int(1)}, {p, Int(1)}}); +} - ASSERT_EQ(V(3u), result[3][0]); - ASSERT_EQ(Int(7u), result[3][1]); +// ____________________________________________________________ +TEST_F(HasPredicateScanTest, patternTrickIllegalInput) { + auto I = ad_utility::testing::IntId; + auto Voc = ad_utility::testing::VocabId; + // The subtree of the `CountAvailablePredicates` is illegal, because the + // pattern index column contains the entry `273` which is neither `NO_PATTERN` + // nor a valid pattern index. 
+ auto illegalInput = + makeIdTableFromVector({{Voc(0), I(273)}, {Voc(1), I(NO_PATTERN)}}); + auto subtree = ad_utility::makeExecutionTree( + qec, std::move(illegalInput), + std::vector>{V{"?x"}, V{"?predicate"}}); + + auto patternTrick = + CountAvailablePredicates(qec, subtree, 1, V{"?predicate"}, V{"?count"}); + EXPECT_ANY_THROW(runTestUnordered(patternTrick, {{p3, Int(2)}, {p, Int(1)}})); +} - ASSERT_EQ(V(4u), result[4][0]); - ASSERT_EQ(Int(3u), result[4][1]); +// ____________________________________________________________ +TEST_F(HasPredicateScanTest, patternTrickAllEntities) { + /* Manual setup of the operations for the full pattern trick: + * SELECT ?predicate COUNT(DISTINCT ?x) WHERE { + * ?x ?predicate ?o + * } GROUP BY ?predicate + */ + auto triple = SparqlTriple{V{"?x"}, HAS_PATTERN_PREDICATE, V{"?predicate"}}; + auto indexScan = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, triple); + auto patternTrick = + CountAvailablePredicates(qec, indexScan, 0, V{"?predicate"}, V{"?count"}); + + runTestUnordered(patternTrick, {{p3, Int(2)}, {p2, Int(1)}, {p, Int(2)}}); } diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 8c0159ff91..a4dc459999 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -186,40 +186,6 @@ TEST(IndexTest, createFromTurtleTest) { runTest(false, true); } -TEST(CreatePatterns, createPatterns) { - { - std::string kb = - " .\n" - " .\n" - " .\n" - " .\n" - " ."; - - const Index& indexNoImpl = getQec(kb)->getIndex(); - const IndexImpl& index = indexNoImpl.getImpl(); - - ASSERT_EQ(2u, index.getHasPattern().size()); - ASSERT_EQ(0u, index.getHasPredicate().size()); - auto getId = ad_utility::testing::makeGetId(indexNoImpl); - // Pattern p0 (for subject ) consists of and p0{getId(""), getId("")}; - // Pattern p1 (for subject ) consists of and ) - std::vector p1{getId(""), getId("")}; - - auto checkPattern = [&index](const auto& expected, Id subject) { - PatternID patternIdx = - 
index.getHasPattern()[subject.getVocabIndex().get()]; - const auto& actual = index.getPatterns()[patternIdx]; - for (size_t i = 0; i < actual.size(); i++) { - ASSERT_EQ(expected[i], actual[i]); - } - }; - - checkPattern(p0, getId("")); - checkPattern(p1, getId("")); - } -} - TEST(IndexTest, createFromOnDiskIndexTest) { std::string kb = " .\n" diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index 29b6d07eda..7a535f500f 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -299,7 +299,7 @@ TEST(LocalVocab, propagation) { checkLocalVocab(transitivePath, std::vector{"x", "y1", "y2"}); // PATTERN TRICK operations. - HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, "?z"); + HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"}); checkLocalVocab(hasPredicateScan, std::vector{"x", "y1", "y2"}); CountAvailablePredicates countAvailablePredictes( testQec, qet(values1), 0, Variable{"?x"}, Variable{"?y"}); diff --git a/test/PatternCreatorTest.cpp b/test/PatternCreatorTest.cpp deleted file mode 100644 index 76021e18a9..0000000000 --- a/test/PatternCreatorTest.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2022, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Johannes Kalmbach - -#include - -#include "./util/IdTestHelpers.h" -#include "index/PatternCreator.h" -#include "util/Serializer/ByteBufferSerializer.h" -#include "util/Serializer/Serializer.h" - -namespace { -auto V = ad_utility::testing::VocabId; -} - -TEST(PatternStatistics, Initialization) { - PatternStatistics patternStatistics{50, 25, 4}; - ASSERT_EQ(patternStatistics.numDistinctSubjectPredicatePairs_, 50u); - ASSERT_FLOAT_EQ(patternStatistics.avgNumDistinctPredicatesPerSubject_, 2.0); - ASSERT_FLOAT_EQ(patternStatistics.avgNumDistinctSubjectsPerPredicate_, 12.5); -} - -TEST(PatternStatistics, Serialization) { - PatternStatistics patternStatistics{50, 25, 4}; - ad_utility::serialization::ByteBufferWriteSerializer writer; - writer << patternStatistics; - ad_utility::serialization::ByteBufferReadSerializer reader{ - std::move(writer).data()}; - - PatternStatistics statistics2; - reader >> statistics2; - - ASSERT_EQ(statistics2.numDistinctSubjectPredicatePairs_, 50u); - ASSERT_FLOAT_EQ(statistics2.avgNumDistinctPredicatesPerSubject_, 2.0); - ASSERT_FLOAT_EQ(statistics2.avgNumDistinctSubjectsPerPredicate_, 12.5); -} - -// Create patterns from a small SPO-sorted sequence of triples. -void createExamplePatterns(PatternCreator& creator) { - creator.processTriple({V(0), V(10), V(20)}); - creator.processTriple({V(0), V(10), V(21)}); - creator.processTriple({V(0), V(11), V(18)}); - creator.processTriple({V(1), V(10), V(18)}); - creator.processTriple({V(1), V(12), V(18)}); - creator.processTriple({V(1), V(13), V(18)}); - creator.processTriple({V(3), V(10), V(28)}); - creator.processTriple({V(3), V(11), V(29)}); - creator.processTriple({V(3), V(11), V(45)}); -} - -// Assert that the contents of patterns read from `filename` match the triples -// from the `createExamplePatterns` function. 
-void assertPatternContents(const std::string& filename) { - double averageNumSubjectsPerPredicate; - double averageNumPredicatesPerSubject; - uint64_t numDistinctSubjectPredicatePairs; - CompactVectorOfStrings patterns; - std::vector subjectToPattern; - - PatternCreator::readPatternsFromFile( - filename, averageNumSubjectsPerPredicate, averageNumPredicatesPerSubject, - numDistinctSubjectPredicatePairs, patterns, subjectToPattern); - - ASSERT_EQ(numDistinctSubjectPredicatePairs, 7); - ASSERT_FLOAT_EQ(averageNumPredicatesPerSubject, 7.0 / 3.0); - ASSERT_FLOAT_EQ(averageNumSubjectsPerPredicate, 7.0 / 4.0); - - // We have two patterns: (10, 11) and (10, 12, 13). - ASSERT_EQ(patterns.size(), 2); - - ASSERT_EQ(patterns[0].size(), 2); - ASSERT_EQ(patterns[0][0], V(10)); - ASSERT_EQ(patterns[0][1], V(11)); - - ASSERT_EQ(patterns[1].size(), 3); - ASSERT_EQ(patterns[1][0], V(10)); - ASSERT_EQ(patterns[1][1], V(12)); - ASSERT_EQ(patterns[1][2], V(13)); - - // We have 4 subjects 0, 1, 2, 3. Subject 2 has no pattern, because - // it has no triples. Subjects 0 and 3 have the first pattern, subject 1 has - // the second pattern. 
- - ASSERT_EQ(subjectToPattern.size(), 4); - ASSERT_EQ(0, subjectToPattern[0]); - ASSERT_EQ(1, subjectToPattern[1]); - ASSERT_EQ(NO_PATTERN, subjectToPattern[2]); - ASSERT_EQ(0, subjectToPattern[3]); -} - -TEST(PatternCreator, writeAndReadWithFinish) { - std::string filename = "patternCreator.test.tmp"; - PatternCreator creator{filename}; - createExamplePatterns(creator); - creator.finish(); - - assertPatternContents(filename); - ad_utility::deleteFile(filename); -} - -TEST(PatternCreator, writeAndReadWithDestructor) { - std::string filename = "patternCreator.test.tmp"; - { - PatternCreator creator{filename}; - createExamplePatterns(creator); - // The destructor of `creator` at the following `} automatically runs - // `creator.finish()` - } - - assertPatternContents(filename); - ad_utility::deleteFile(filename); -} - -TEST(PatternCreator, writeAndReadWithDestructorAndFinish) { - std::string filename = "patternCreator.test.tmp"; - { - PatternCreator creator{filename}; - createExamplePatterns(creator); - creator.finish(); - // The destructor of `creator` at the following `}` does not run - // `creator.finish()` because it has already been manually called. 
- } - - assertPatternContents(filename); - ad_utility::deleteFile(filename); -} diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index fdb549a733..1db0fbc10f 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -875,3 +875,20 @@ TEST(QueryPlannerTest, TooManyTriples) { qp.createExecutionTree(pq), ::testing::ContainsRegex("At most 64 triples allowed at the moment.")); } + +// ___________________________________________________________________________ +TEST(QueryPlanner, CountAvailablePredicates) { + h::expect( + "SELECT ?p (COUNT(DISTINCT ?s) as ?cnt) WHERE { ?s ?p ?o} GROUP BY ?p", + h::CountAvailablePredicates( + 0, Var{"?p"}, Var{"?cnt"}, + h::IndexScanFromStrings("?s", HAS_PATTERN_PREDICATE, "?p"))); + h::expect( + "SELECT ?p (COUNT(DISTINCT ?s) as ?cnt) WHERE { ?s ql:has-predicate ?p} " + "GROUP BY ?p", + h::CountAvailablePredicates( + 0, Var{"?p"}, Var{"?cnt"}, + h::IndexScanFromStrings("?s", HAS_PATTERN_PREDICATE, "?p"))); + // TODO Add a test for the case with subtrees with and without + // rewriting of triples. +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index 0b785e69e7..477814b2e9 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -8,6 +8,7 @@ #include "./util/GTestHelpers.h" #include "engine/Bind.h" #include "engine/CartesianProductJoin.h" +#include "engine/CountAvailablePredicates.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/MultiColumnJoin.h" @@ -141,6 +142,23 @@ inline auto NeutralElementOperation = []() { An()); }; +// Matcher for a `CountAvailablePredicates` operation. The case of 0 children +// means that it's a full scan. +inline auto CountAvailablePredicates = + [](size_t subjectColumnIdx, const Variable& predicateVar, + const Variable& countVar, + const std::same_as auto&... 
childMatchers) + requires(sizeof...(childMatchers) <= 1) { + return RootOperation<::CountAvailablePredicates>(AllOf( + AD_PROPERTY(::CountAvailablePredicates, subjectColumnIndex, + Eq(subjectColumnIdx)), + AD_PROPERTY(::CountAvailablePredicates, predicateVariable, + Eq(predicateVar)), + AD_PROPERTY(::CountAvailablePredicates, countVariable, Eq(countVar)), + AD_PROPERTY(Operation, getChildren, + ElementsAre(Pointee(childMatchers)...)))); +}; + // Same as above, but the subject, predicate, and object are passed in as // strings. The strings are automatically converted a matching // `TripleComponent`. diff --git a/test/index/CMakeLists.txt b/test/index/CMakeLists.txt index b5eaf88cce..0f5e91e139 100644 --- a/test/index/CMakeLists.txt +++ b/test/index/CMakeLists.txt @@ -1 +1 @@ -addLinkAndDiscoverTest(PatternCreatorNewTest index) +addLinkAndDiscoverTest(PatternCreatorTest index) diff --git a/test/index/PatternCreatorNewTest.cpp b/test/index/PatternCreatorTest.cpp similarity index 94% rename from test/index/PatternCreatorNewTest.cpp rename to test/index/PatternCreatorTest.cpp index 5064cfcf0f..62858919bc 100644 --- a/test/index/PatternCreatorNewTest.cpp +++ b/test/index/PatternCreatorTest.cpp @@ -22,7 +22,7 @@ using TripleVec = std::vector>; static const Id hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); // Convert a PSOSorter to a vector of triples for easier handling -TripleVec getVectorFromSorter(PatternCreatorNew::PSOSorter&& sorter) { +TripleVec getVectorFromSorter(PatternCreator::PSOSorter&& sorter) { TripleVec triples; for (const auto& triple : sorter.sortedView()) { triples.push_back(static_cast>(triple)); @@ -33,14 +33,14 @@ TripleVec getVectorFromSorter(PatternCreatorNew::PSOSorter&& sorter) { using ad_utility::source_location; } // namespace -TEST(PatternStatisticsNew, Initialization) { +TEST(PatternStatistics, Initialization) { PatternStatistics patternStatistics{50, 25, 4}; ASSERT_EQ(patternStatistics.numDistinctSubjectPredicatePairs_, 50u); 
ASSERT_FLOAT_EQ(patternStatistics.avgNumDistinctPredicatesPerSubject_, 2.0); ASSERT_FLOAT_EQ(patternStatistics.avgNumDistinctSubjectsPerPredicate_, 12.5); } -TEST(PatternStatisticsNew, Serialization) { +TEST(PatternStatistics, Serialization) { PatternStatistics patternStatistics{50, 25, 4}; ad_utility::serialization::ByteBufferWriteSerializer writer; writer << patternStatistics; @@ -56,7 +56,7 @@ TEST(PatternStatisticsNew, Serialization) { } // Create patterns from a small SPO-sorted sequence of triples. -auto createExamplePatterns(PatternCreatorNew& creator) { +auto createExamplePatterns(PatternCreator& creator) { using A = std::array; std::vector expected; @@ -116,7 +116,7 @@ void assertPatternContents(const std::string& filename, uint64_t numDistinctSubjectPredicatePairs; CompactVectorOfStrings patterns; - PatternCreatorNew::readPatternsFromFile( + PatternCreator::readPatternsFromFile( filename, averageNumSubjectsPerPredicate, averageNumPredicatesPerSubject, numDistinctSubjectPredicatePairs, patterns); @@ -149,9 +149,9 @@ void assertPatternContents(const std::string& filename, EXPECT_THAT(addedTriples, ::testing::ElementsAreArray(expectedTriples)); } -TEST(PatternCreatorNew, writeAndReadWithFinish) { +TEST(PatternCreator, writeAndReadWithFinish) { std::string filename = "patternCreator.test.tmp"; - PatternCreatorNew creator{filename, memForStxxl}; + PatternCreator creator{filename, memForStxxl}; auto hashPatternAsPSOPtr = createExamplePatterns(creator); creator.finish(); diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 4bff3a5137..060c292b04 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -24,7 +24,9 @@ std::vector getAllIndexFilenames( const std::string& indexBasename) { return {indexBasename + ".ttl", indexBasename + ".index.pos", + indexBasename + ".index.pos.meta", indexBasename + ".index.pso", + indexBasename + ".index.pso.meta", indexBasename + ".index.sop", indexBasename + 
".index.sop.meta", indexBasename + ".index.spo", @@ -42,25 +44,27 @@ std::vector getAllIndexFilenames( } namespace { -// Check that the old pattern implementation (separate patterns in separate -// files) have exactly the same contents as the patterns that are folded into -// the PSO and POS permutation. -void checkConsistencyBetweenOldAndNewPatterns(const Index& index) { +// Check that the patterns as stored in the `ql:has-pattern` relation in the PSO +// and POS permutations have exactly the same contents as the patterns that are +// folded into the permutations as additional columns. +void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( + const Index& index) { static constexpr size_t col0IdTag = 43; - auto checkSingleElement = [](const Index& index, size_t patternIdx, Id id) { - const auto& hasPattern = index.getHasPattern(); - auto expectedPattern = [&] { - if (id.getDatatype() != Datatype::VocabIndex) { - return NO_PATTERN; - } - auto idx = id.getVocabIndex().get(); - if (idx >= hasPattern.size()) { - return NO_PATTERN; - } - return hasPattern[idx]; - }(); - EXPECT_EQ(patternIdx, expectedPattern) - << id << ' ' << index.getHasPattern().size() << ' ' << NO_PATTERN; + auto cancellationDummy = std::make_shared>(); + auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); + auto checkSingleElement = [&cancellationDummy, &hasPatternId]( + const Index& index, size_t patternIdx, Id id) { + auto scanResultHasPattern = index.scan( + hasPatternId, id, Permutation::Enum::PSO, {}, cancellationDummy); + // Each ID has at most one pattern, it can have none if it doesn't + // appear as a subject in the knowledge graph. 
+ AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); + if (scanResultHasPattern.numRows() == 0) { + EXPECT_EQ(patternIdx, NO_PATTERN) << id << ' ' << NO_PATTERN; + } else { + auto actualPattern = scanResultHasPattern(0, 0).getInt(); + EXPECT_EQ(patternIdx, actualPattern) << id << ' ' << actualPattern; + } }; auto checkConsistencyForCol0IdAndPermutation = @@ -68,29 +72,20 @@ void checkConsistencyBetweenOldAndNewPatterns(const Index& index) { size_t objectColIdx) { auto cancellationDummy = std::make_shared>(); - auto scanResult = index.scan(col0Id, std::nullopt, permutation, - std::array{ColumnIndex{2}, ColumnIndex{3}}, - cancellationDummy); - auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); - auto scanResultHasPattern = - index.scan(hasPatternId, col0Id, Permutation::Enum::PSO, {}, - cancellationDummy); - // Each ID has at most one pattern, it can have none if it doesn't - // appear as a subject in the knowledge graph. - AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); - if (scanResultHasPattern.numRows() == 0) { - checkSingleElement(index, NO_PATTERN, col0Id); - } else { - checkSingleElement(index, scanResultHasPattern(0, 0).getInt(), - col0Id); - } + auto scanResult = index.scan( + col0Id, std::nullopt, permutation, + std::array{ColumnIndex{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN}, + ColumnIndex{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}}, + cancellationDummy); ASSERT_EQ(scanResult.numColumns(), 4u); for (const auto& row : scanResult) { - auto patternIdx = row[2].getInt(); + auto patternIdx = + row[ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN].getInt(); Id subjectId = row[subjectColIdx]; checkSingleElement(index, patternIdx, subjectId); Id objectId = objectColIdx == col0IdTag ? 
col0Id : row[objectColIdx]; - auto patternIdxObject = row[3].getInt(); + auto patternIdxObject = + row[ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN].getInt(); checkSingleElement(index, patternIdxObject, objectId); } }; @@ -182,7 +177,7 @@ Index makeTestIndex(const std::string& indexBasename, ad_utility::setGlobalLoggingStream(&std::cout); if (usePatterns && loadAllPermutations) { - checkConsistencyBetweenOldAndNewPatterns(index); + checkConsistencyBetweenPatternPredicateAndAdditionalColumn(index); } return index; }