Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code for locating triples in an existing index #1000

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/global/IdTriple.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Copyright 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Authors: Hannah Bast <[email protected]>

#pragma once

#include <array>

#include "global/Id.h"

// A triple of `Id`s, stored as a fixed-size array of three elements.
//
// TODO: Should we have a dedicated class for this? We need this in several
// places.
using IdTriple = std::array<Id, 3>;
4 changes: 2 additions & 2 deletions src/global/ValueId.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,10 +311,10 @@ class ValueId {
/// This operator is only for debugging and testing. It returns a
/// human-readable representation.
friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) {
ostr << toString(id.getDatatype()) << ':';
ostr << toString(id.getDatatype())[0] << ':';
auto visitor = [&ostr]<typename T>(T&& value) {
if constexpr (ad_utility::isSimilar<T, ValueId::UndefinedType>) {
ostr << "Undefined";
ostr << "xx";
} else if constexpr (ad_utility::isSimilar<T, double> ||
ad_utility::isSimilar<T, int64_t>) {
ostr << std::to_string(value);
Expand Down
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ add_subdirectory(vocabulary)
add_library(index
Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
Vocabulary.cpp VocabularyOnDisk.cpp
Permutation.cpp TextMetaData.cpp
LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
DocsDB.cpp FTSAlgorithms.cpp
PrefixHeuristic.cpp CompressedRelation.cpp
PatternCreator.cpp)
Expand Down
4 changes: 3 additions & 1 deletion src/index/CompressedRelation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,9 @@ CompressedRelationWriter::createPermutationPair(
}
inputWaitTimer.cont();
for (auto& block : AD_FWD(sortedTriples)) {
AD_CORRECTNESS_CHECK(block.numColumns() == numColumns + 1);
AD_CORRECTNESS_CHECK(block.numColumns() == numColumns + 1,
"block.numColumns() = ", block.numColumns(),
", numColumns = ", numColumns);
inputWaitTimer.stop();
// This only happens when the index is completely empty.
if (block.empty()) {
Expand Down
1 change: 1 addition & 0 deletions src/index/CompressedRelation.h
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ class CompressedRelationReader {
static void decompressColumn(const std::vector<char>& compressedColumn,
size_t numRowsToRead, Iterator iterator);

public:
// Read the block that is identified by the `blockMetaData` from the `file`,
// decompress and return it. Only the columns specified by the `columnIndices`
// are returned.
Expand Down
15 changes: 13 additions & 2 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -740,10 +740,12 @@ IndexImpl::createPermutations(size_t numColumns, auto&& sortedTriples,
}

// ________________________________________________________________________
void IndexImpl::createPermutationPair(size_t numColumns, auto&& sortedTriples,
template <typename SortedTriplesType, typename... CallbackTypes>
void IndexImpl::createPermutationPair(size_t numColumns,
SortedTriplesType&& sortedTriples,
const Permutation& p1,
const Permutation& p2,
auto&&... perTripleCallbacks) {
CallbackTypes&&... perTripleCallbacks) {
auto [metaData1, metaData2] = createPermutations(
numColumns, AD_FWD(sortedTriples), p1, p2, AD_FWD(perTripleCallbacks)...);
// Set the name of this newly created pair of `IndexMetaData` objects.
Expand All @@ -762,6 +764,15 @@ void IndexImpl::createPermutationPair(size_t numColumns, auto&& sortedTriples,
writeMetadata(metaData2, p2);
}

// Explicit instantiation needed for `test/LocatedTripleTest.cpp`.
//
// TODO: Do we really need to make `SortedTriplesType` a template parameter (or
// `auto&&` as it was before)? To me it looks like this function (and others of
// its kind) is always called with `sortedTriples` of type `BlocksOfTriples`.
template void IndexImpl::createPermutationPair<IndexImpl::BlocksOfTriples>(
size_t, IndexImpl::BlocksOfTriples&&, const Permutation&,
const Permutation&);

// _____________________________________________________________________________
void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
setOnDiskBase(onDiskBase);
Expand Down
113 changes: 62 additions & 51 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,30 +492,38 @@ class IndexImpl {
std::array<size_t, 3> permutation,
auto&&... perTripleCallbacks);

// _______________________________________________________________________
// Create a pair of permutations. Only works for valid pairs (PSO-POS,
// OSP-OPS, SPO-SOP). First creates the permutation and then exchanges the
// multiplicities and also writes the MetaData to disk. So we end up with
// fully functional permutations.
//
// TODO: The rest of this comment looks outdated.
//
// performUnique must be set for the first pair created using vec to enforce
// RDF standard (no duplicate triples).
// createPatternsAfterFirst is only valid when the pair is SPO-SOP because
// the SPO permutation is also needed for patterns (see usage in
// IndexImpl::createFromFile function)

void createPermutationPair(size_t numColumns, auto&& sortedTriples,
public:
template <typename SortedTriplesType, typename... CallbackTypes>
void createPermutationPair(size_t numColumns,
SortedTriplesType&& sortedTriples,
const Permutation& p1, const Permutation& p2,
auto&&... perTripleCallbacks);
CallbackTypes&&... perTripleCallbacks);
// void createPermutationPair(size_t numColumns, auto&& sortedTriples,
// const Permutation& p1, const Permutation& p2,
// auto&&... perTripleCallbacks);

private:
// wrapper for createPermutation that saves a lot of code duplications
// Writes the permutation that is specified by argument permutation
// performs std::unique on arg vec iff arg performUnique is true (normally
// done for first permutation that is created using vec).
// Will sort vec.
// returns the MetaData (MmapBased or HmapBased) for this relation.
// Careful: only multiplicities for first column is valid after call, need to
// call exchangeMultiplicities as done by createPermutationPair
// the optional is std::nullopt if vec and thus the index is empty
// Careful: only the multiplicities for the first column are valid after the
// call, need to call exchangeMultiplicities as done by
// createPermutationPair. The optional is std::nullopt if vec and thus the
// index is empty.
std::pair<IndexMetaDataMmapDispatcher::WriteType,
IndexMetaDataMmapDispatcher::WriteType>
createPermutations(size_t numColumns, auto&& sortedTriples,
Expand Down Expand Up @@ -545,12 +553,13 @@ class IndexImpl {
MakeFromUint64t makeFromUint = MakeFromUint64t{}) const;

// Get the metadata for the block from the text index that contains the
// `word`. Also works for prefixes that are terminated with `PREFIX_CHAR` like
// "astro*". Returns `nullopt` if no suitable block was found because no
// matching word is contained in the text index. Some additional information
// is also returned that is often required by the calling functions:
// `hasToBeFiltered_` is true iff `word` is NOT the only word in the text
// block, and additional filtering is thus required. `idRange_` is the range
// `word`. Also works for prefixes that are terminated with `PREFIX_CHAR`
// like "astro*". Returns `nullopt` if no suitable block was found because
// no matching word is contained in the text index. Some additional
// information is also returned that is often required by the calling
// functions: `hasToBeFiltered_` is true iff `word` is NOT the only word in
// the text block, and additional filtering is thus required. `idRange_` is
// the range
// `[first, last]` of the `WordVocabIndex`es that correspond to the word
// (which might also be a prefix, thus it is a range).
struct TextBlockMetadataAndWordInfo {
Expand Down Expand Up @@ -612,8 +621,8 @@ class IndexImpl {

private:
/**
* @brief Throws an exception if no patterns are loaded. Should be called from
* within any index method that returns data requiring the patterns
* @brief Throws an exception if no patterns are loaded. Should be called
* from within any index method that returns data requiring the patterns
* file.
*/
void throwExceptionIfNoPatterns() const;
Expand All @@ -632,25 +641,26 @@ class IndexImpl {

public:
// Count the number of "QLever-internal" triples (predicate ql:langtag or
// predicate starts with @) and all other triples (that were actually part of
// the input).
// predicate starts with @) and all other triples (that were actually part
// of the input).
NumNormalAndInternal numTriples() const;

// The index contains several triples that are not part of the "actual"
// knowledge graph, but are added by QLever for internal reasons (e.g. for an
// efficient implementation of language filters). For a given
// knowledge graph, but are added by QLever for internal reasons (e.g. for
// an efficient implementation of language filters). For a given
// `Permutation::Enum`, returns the following `std::pair`:
//
// first: A `vector<pair<Id, Id>>` that denotes ranges in the first column
// of the permutation that imply that a triple is added. For example
// in the `SPO` and `SOP` permutation a literal subject means that the
// triple was added (literals are not legal subjects in RDF), so the
// pair `(idOfFirstLiteral, idOfLastLiteral + 1)` will be contained
// in the vector.
// in the `SPO` and `SOP` permutation a literal subject means that
// the triple was added (literals are not legal subjects in RDF), so
// the pair `(idOfFirstLiteral, idOfLastLiteral + 1)` will be
// contained in the vector.
// second: A lambda that checks for a triple *that is not already excluded
// by the ignored ranges from the first argument* whether it still
// is an added triple. For example in the `Sxx` and `Oxx` permutation
// a triple where the predicate starts with '@' (instead of the usual
// is an added triple. For example in the `Sxx` and `Oxx`
// permutation a triple where the predicate starts with '@' (instead
// of the usual
// '<') is an added triple from the language filter implementation.
//
// Note: A triple from a given permutation is an added triple if and only if
Expand Down Expand Up @@ -698,13 +708,13 @@ class IndexImpl {
auto isTripleIgnored = [permutation,
isInternalPredicateId](const auto& triple) {
// TODO<joka921, everybody in the future>:
// A lot of code (especially for statistical queries in `GroupBy.cpp` and
// the pattern trick) relies on this function being a noop for the `PSO`
// and `POS` permutations, meaning that it suffices to check the
// `ignoredRanges` for them. Should this ever change (which means that we
// add internal triples that use predicates that are actually contained in
// the knowledge graph), then all the code that uses this function has to
// be thoroughly reviewed.
// A lot of code (especially for statistical queries in `GroupBy.cpp`
// and the pattern trick) relies on this function being a noop for the
// `PSO` and `POS` permutations, meaning that it suffices to check the
// `ignoredRanges` for them. Should this ever change (which means that
// we add internal triples that use predicates that are actually
// contained in the knowledge graph), then all the code that uses this
// function has to be thoroughly reviewed.
if (permutation == SPO || permutation == OPS) {
// Predicates are always entities from the vocabulary.
return isInternalPredicateId(triple[1]);
Expand All @@ -718,12 +728,13 @@ class IndexImpl {
}
using BlocksOfTriples = cppcoro::generator<IdTableStatic<0>>;

// Functions to create the pairs of permutations during the index build. Each
// of them takes the following arguments:
// Functions to create the pairs of permutations during the index build.
// Each of them takes the following arguments:
// * `isQleverInternalId` a callable that takes an `Id` and returns true iff
// the corresponding IRI was internally added by QLever and not part of the
// knowledge graph.
// * `sortedInput` The input, must be sorted by the first permutation in the
// the corresponding IRI was internally added by QLever and not part of
// the knowledge graph.
// * `sortedInput` The input, must be sorted by the first permutation in
// the
// function name.
// * `nextSorter` A callback that is invoked for each row in each of the
// blocks in the input. Typically used to set up the sorting for the
Expand All @@ -746,17 +757,17 @@ class IndexImpl {
NextSorter&&... nextSorter);

// Create the PSO and POS permutations. Additionally, count the number of
// distinct predicates and the number of actual triples and write them to the
// metadata.
// distinct predicates and the number of actual triples and write them to
// the metadata.
template <typename... NextSorter>
requires(sizeof...(NextSorter) <= 1)
void createPSOAndPOS(size_t numColumns, auto& isInternalId,
BlocksOfTriples sortedTriples,
NextSorter&&... nextSorter);

// Set up one of the permutation sorters with the appropriate memory limit.
// The `permutationName` is used to determine the filename and must be unique
// for each call during one index build.
// The `permutationName` is used to determine the filename and must be
// unique for each call during one index build.
template <typename Comparator, size_t N = NumColumnsIndexBuilding>
ExternalSorter<Comparator, N> makeSorter(
std::string_view permutationName) const;
Expand All @@ -773,10 +784,10 @@ class IndexImpl {
// function names are consistent with the aliases for the sorters, i.e. that
// `createFirstPermutationPair` corresponds to the `FirstPermutation`.

// The `createFirstPermutationPair` has a special implementation for the case
// of only two permutations (where we have to build the Pxx permutations). In
// all other cases the Sxx permutations are built first because we need the
// patterns.
// The `createFirstPermutationPair` has a special implementation for the
// case of only two permutations (where we have to build the Pxx
// permutations). In all other cases the Sxx permutations are built first
// because we need the patterns.
std::optional<PatternCreator::TripleSorter> createFirstPermutationPair(
auto&&... args) {
static_assert(std::is_same_v<FirstPermutation, SortBySPO>);
Expand All @@ -799,12 +810,12 @@ class IndexImpl {
return createPSOAndPOS(AD_FWD(args)...);
}

// Build the OSP and OPS permutations from the output of the `PatternCreator`.
// The permutations will have two additional columns: The subject pattern of
// the subject (which is already created by the `PatternCreator`) and the
// subject pattern of the object (which is created by this function). Return
// these five columns sorted by PSO, to be used as an input for building the
// PSO and POS permutations.
// Build the OSP and OPS permutations from the output of the
// `PatternCreator`. The permutations will have two additional columns: The
// subject pattern of the subject (which is already created by the
// `PatternCreator`) and the subject pattern of the object (which is created
// by this function). Return these five columns sorted by PSO, to be used as
// an input for building the PSO and POS permutations.
std::unique_ptr<ExternalSorter<SortByPSO, 5>> buildOspWithPatterns(
PatternCreator::TripleSorter sortersFromPatternCreator,
auto isQLeverInternalId);
Expand Down
21 changes: 12 additions & 9 deletions src/index/IndexMetaData.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])

#pragma once

#include <stdio.h>
Expand All @@ -13,14 +14,14 @@
#include <utility>
#include <vector>

#include "../global/Id.h"
#include "../util/File.h"
#include "../util/HashMap.h"
#include "../util/MmapVector.h"
#include "../util/ReadableNumberFact.h"
#include "../util/Serializer/Serializer.h"
#include "./MetaDataHandler.h"
#include "CompressedRelation.h"
#include "global/Id.h"
#include "index/CompressedRelation.h"
#include "index/MetaDataHandler.h"
#include "util/File.h"
#include "util/HashMap.h"
#include "util/MmapVector.h"
#include "util/ReadableNumberFact.h"
#include "util/Serializer/Serializer.h"

using std::array;
using std::pair;
Expand Down Expand Up @@ -87,6 +88,7 @@ class IndexMetaData {

// For each relation, its meta data.
MapType _data;

// For each compressed block, its meta data.
BlocksType _blockData;

Expand Down Expand Up @@ -222,7 +224,8 @@ template <class MapType>
ad_utility::File& operator<<(ad_utility::File& f,
const IndexMetaData<MapType>& imd);

// aliases for easier use in Index class
// Aliases for easier use in classes that build or query permutations, like
// `IndexImpl`.
using MetaWrapperMmap =
MetaDataWrapperDense<ad_utility::MmapVector<CompressedRelationMetadata>>;
using MetaWrapperMmapView = MetaDataWrapperDense<
Expand Down
Loading