From 84a4bdf9ee790390ffaea5d8eeebfbb52f61f8c7 Mon Sep 17 00:00:00 2001
From: Hannah Bast
Date: Fri, 9 Jun 2023 16:30:57 +0200
Subject: [PATCH 1/4] Code for locating triples in an existing index

This is the first part of a series of PRs split off from the large
proof-of-concept PR https://github.com/ad-freiburg/qlever/pull/916, which
realizes SPARQL 1.1 Update
---
 src/global/IdTriple.h          |  18 ++
 src/index/CMakeLists.txt       |   1 +
 src/index/CompressedRelation.h |   1 +
 src/index/IndexMetaData.h      |  20 +-
 src/index/LocatedTriples.cpp   | 349 +++++++++++++++++++++++++++++++++
 src/index/LocatedTriples.h     | 196 ++++++++++++++++++
 src/index/MetaDataHandler.h    |  51 +++--
 test/CMakeLists.txt            |   2 +
 test/LocatedTriplesTest.cpp    | 173 ++++++++++++++++
 9 files changed, 789 insertions(+), 22 deletions(-)
 create mode 100644 src/global/IdTriple.h
 create mode 100644 src/index/LocatedTriples.cpp
 create mode 100644 src/index/LocatedTriples.h
 create mode 100644 test/LocatedTriplesTest.cpp

diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h
new file mode 100644
index 0000000000..0353b8c747
--- /dev/null
+++ b/src/global/IdTriple.h
@@ -0,0 +1,18 @@
+// Copyright 2023, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Hannah Bast
+
+#pragma once
+
+#include <array>
+
+#include "global/Id.h"
+
+// Should we have an own class for this? We need this at several places.
+using IdTriple = std::array<Id, 3>;
+
+// Hash value for such a triple (combines the hashes of the three `Id`s).
+template <typename H>
+H AbslHashValue(H h, const IdTriple& triple) {
+  return H::combine(std::move(h), triple[0], triple[1], triple[2]);
+}
diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt
index 4bbf53f647..fd65af2bd4 100644
--- a/src/index/CMakeLists.txt
+++ b/src/index/CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(index
         VocabularyOnDisk.h VocabularyOnDisk.cpp
         IndexMetaData.h IndexMetaDataImpl.h
         MetaDataHandler.h
+        LocatedTriples.h LocatedTriples.cpp
         StxxlSortFunctors.h
         TextMetaData.cpp TextMetaData.h
         DocsDB.cpp DocsDB.h
diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h
index 3c6c5df80a..63d39a28ba 100644
--- a/src/index/CompressedRelation.h
+++ b/src/index/CompressedRelation.h
@@ -305,6 +305,7 @@ class CompressedRelationReader {
   static void decompressColumn(const std::vector& compressedColumn,
                                size_t numRowsToRead, Iterator iterator);
 
+ public:
   // Read the block that is identified by the `blockMetaData` from the `file`,
   // decompress and return it.
   // If `columnIndices` is `nullopt`, then all columns of the block are read,
diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h
index 4e3ef4b38f..3039c0ba28 100644
--- a/src/index/IndexMetaData.h
+++ b/src/index/IndexMetaData.h
@@ -1,6 +1,7 @@
 // Copyright 2015, University of Freiburg,
 // Chair of Algorithms and Data Structures.
 // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de)
+
 #pragma once
 
 #include
@@ -13,14 +14,14 @@
 #include
 #include
 
-#include "../global/Id.h"
-#include "../util/File.h"
-#include "../util/HashMap.h"
-#include "../util/MmapVector.h"
-#include "../util/ReadableNumberFact.h"
-#include "../util/Serializer/Serializer.h"
-#include "./MetaDataHandler.h"
-#include "CompressedRelation.h"
+#include "global/Id.h"
+#include "index/CompressedRelation.h"
+#include "index/MetaDataHandler.h"
+#include "util/File.h"
+#include "util/HashMap.h"
+#include "util/MmapVector.h"
+#include "util/ReadableNumberFact.h"
+#include "util/Serializer/Serializer.h"
 
 using std::array;
 using std::pair;
@@ -86,7 +87,10 @@ class IndexMetaData {
   // name and the variable name are terrible.
 
   // For each relation, its meta data.
+ public:
   MapType _data;
+
+ private:
   // For each compressed block, its meta data.
   BlocksType _blockData;
 
diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp
new file mode 100644
index 0000000000..acd6988675
--- /dev/null
+++ b/src/index/LocatedTriples.cpp
@@ -0,0 +1,349 @@
+// Copyright 2023, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Hannah Bast
+
+#include "index/LocatedTriples.h"
+
+#include <algorithm>
+
+#include "index/CompressedRelation.h"
+#include "index/IndexMetaData.h"
+#include "index/Permutations.h"
+
+// ____________________________________________________________________________
+LocatedTriple LocatedTriple::locateTripleInPermutation(
+    Id id1, Id id2, Id id3, const Permutation& permutation) {
+  // Get the internal data structures from the permutation.
+  auto& file = permutation._file;
+  const auto& meta = permutation._meta;
+  const auto& reader = permutation._reader;
+
+  // Find the index of the first block where the last triple is not smaller.
+  //
+  // NOTE: Since `col2LastId_` has been added to `CompressedBlockMetadata`, this
+  // can be computed without having to decompress any blocks.
+  const std::vector<CompressedBlockMetadata>& blocks = meta.blockData();
+  auto matchingBlock = std::lower_bound(
+      blocks.begin(), blocks.end(), std::array{id1, id2, id3},
+      [](const CompressedBlockMetadata& block, const auto& triple) -> bool {
+        if (block.col0LastId_ < triple[0]) {
+          return true;
+        } else if (block.col0LastId_ == triple[0]) {
+          if (block.col1LastId_ < triple[1]) {
+            return true;
+          } else if (block.col1LastId_ == triple[1]) {
+            return block.col2LastId_ < triple[2];
+          }
+        }
+        return false;
+      });
+  size_t blockIndex = matchingBlock - blocks.begin();
+
+  // Preliminary `LocatedTriple` object with the correct `blockIndex` and
+  // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set
+  // to `false`.
+  LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false};
+
+  // If all `Id`s from all blocks are smaller, we return the index of the last
+  // block plus one (typical "end" semantics) and the special row index
+  // `NO_ROW_INDEX` (see how this is considered in `mergeTriples`).
+  if (matchingBlock == blocks.end()) {
+    AD_CORRECTNESS_CHECK(blockIndex == blocks.size());
+    return locatedTriple;
+  }
+
+  // Read and decompress the block.
+  DecompressedBlock blockTuples =
+      reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt);
+
+  // Find the smallest relation `Id` that is not smaller than `id1` and get its
+  // metadata and the position of the first and last triple with that `Id` in
+  // the block.
+  //
+  // IMPORTANT: If relation `id1` exists in the index, but our triple is larger
+  // than all triples of that relation in the index and the last triple of that
+  // relation ends a block, then our block search above (correctly) landed us at
+  // the next block. We can detect this by checking whether the first relation
+  // `Id` of the block is larger than `id1` and then we should get the metadata
+  // for that `Id` and not for `id1` (which would pertain to a previous block).
+  //
+  // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`,
+  // which is relevant in the rare case where a triple is inserted with an
+  // `Id` for predicate that is not a new `Id`, but has not been used for a
+  // predicate in the original index.
+  //
+  // NOTE: Since we have already handled the case, where all `Id`s in the
+  // permutation are smaller, above, such a relation should exist.
+  Id searchId =
+      matchingBlock->col0FirstId_ > id1 ? matchingBlock->col0FirstId_ : id1;
+  auto it = meta._data.lower_bound(searchId);
+  AD_CORRECTNESS_CHECK(it != meta._data.end());
+  Id id = it.getId();
+  const auto& relationMetadata = meta.getMetaData(id);
+  size_t offsetBegin = relationMetadata.offsetInBlock_;
+  size_t offsetEnd = offsetBegin + relationMetadata.numRows_;
+  // Note: If the relation spans multiple blocks, we know that the block we
+  // found above contains only triples from that relation.
+  if (offsetBegin == std::numeric_limits<size_t>::max()) {
+    offsetBegin = 0;
+    offsetEnd = blockTuples.size();
+  }
+  AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size());
+  AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size());
+
+  // If we have found `id1`, we can do a binary search in the portion of the
+  // block that pertains to it (note the special case mentioned above, where
+  // we are already at the beginning of the next block).
+  //
+  // Otherwise, `id` is the next larger `Id` and the position of the first
+  // triple of that relation is exactly the position we are looking for.
+  if (id == id1) {
+    locatedTriple.rowIndexInBlock =
+        std::lower_bound(blockTuples.begin() + offsetBegin,
+                         blockTuples.begin() + offsetEnd,
+                         std::array{id2, id3},
+                         [](const auto& a, const auto& b) {
+                           return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]);
+                         }) -
+        blockTuples.begin();
+    // Check if the triple at the found position is equal to `id1 id2 id3`.
+    // Note that our default for `existsInIndex` was set to `false` above.
+    const size_t i = locatedTriple.rowIndexInBlock;
+    AD_CORRECTNESS_CHECK(i < blockTuples.size());
+    if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) {
+      locatedTriple.existsInIndex = true;
+    }
+  } else {
+    AD_CORRECTNESS_CHECK(id1 < id);
+    locatedTriple.rowIndexInBlock = offsetBegin;
+  }
+
+  // Return the result.
+  return locatedTriple;
+}
+
+// ____________________________________________________________________________
+template <LocatedTriplesPerBlock::MatchMode matchMode>
+std::pair<size_t, size_t> LocatedTriplesPerBlock::numTriplesImpl(
+    size_t blockIndex, Id id1, Id id2) const {
+  // If no located triples for `blockIndex` exist, there is no entry in `map_`.
+  if (!map_.contains(blockIndex)) {
+    return {0, 0};
+  }
+
+  // Otherwise iterate over all located triples and count how many of them exist
+  // in the index ("to be deleted") and how many are new ("to be inserted").
+  size_t countExists = 0;
+  size_t countNew = 0;
+  for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) {
+    // Helper lambda for increasing the right counter.
+    auto increaseCountIf = [&](bool increase) {
+      if (increase) {
+        if (locatedTriple.existsInIndex) {
+          ++countExists;
+        } else {
+          ++countNew;
+        }
+      }
+    };
+    // Increase depending on the mode.
+    if constexpr (matchMode == MatchMode::MatchAll) {
+      increaseCountIf(true);
+    } else if constexpr (matchMode == MatchMode::MatchId1) {
+      increaseCountIf(locatedTriple.id1 == id1);
+    } else if constexpr (matchMode == MatchMode::MatchId1AndId2) {
+      increaseCountIf(locatedTriple.id1 == id1 && locatedTriple.id2 == id2);
+    }
+  }
+  return {countNew, countExists};
+}
+
+// ____________________________________________________________________________
+std::pair<size_t, size_t> LocatedTriplesPerBlock::numTriples(
+    size_t blockIndex) const {
+  return numTriplesImpl<MatchMode::MatchAll>(blockIndex);
+}
+
+// ____________________________________________________________________________
+std::pair<size_t, size_t> LocatedTriplesPerBlock::numTriples(size_t blockIndex,
+                                                             Id id1) const {
+  return numTriplesImpl<MatchMode::MatchId1>(blockIndex, id1);
+}
+
+// ____________________________________________________________________________
+std::pair<size_t, size_t> LocatedTriplesPerBlock::numTriples(size_t blockIndex,
+                                                             Id id1,
+                                                             Id id2) const {
+  return numTriplesImpl<MatchMode::MatchId1AndId2>(blockIndex, id1, id2);
+}
+
+// ____________________________________________________________________________
+template <LocatedTriplesPerBlock::MatchMode matchMode>
+size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex,
+                                            std::optional<IdTable> block,
+                                            IdTable& result,
+                                            size_t offsetInResult, Id id1,
+                                            Id id2, size_t rowIndexInBlockBegin,
+                                            size_t rowIndexInBlockEnd) const {
+  // This method should only be called if there are located triples in the
+  // specified block.
+  AD_CONTRACT_CHECK(map_.contains(blockIndex));
+
+  // The special case `block == std::nullopt` (write only located triples to
+  // `result`) is only allowed, when `id1` or `id1` and `id2` are specified.
+  AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll);
+
+  // If `rowIndexInBlockEnd` has the default value (see `LocatedTriples.h`), the
+  // intended semantics is that we read the whole block (note that we can't have
+  // a default value that depends on the values of previous arguments).
+  if (rowIndexInBlockEnd == LocatedTriple::NO_ROW_INDEX && block.has_value()) {
+    rowIndexInBlockEnd = block.value().size();
+  }
+
+  // Check that `rowIndexInBlockBegin` and `rowIndexInBlockEnd` define a valid
+  // and non-empty range and that it is a subrange of `block` (unless the latter
+  // is `std::nullopt`).
+  if (block.has_value()) {
+    AD_CONTRACT_CHECK(rowIndexInBlockBegin < block.value().size());
+    AD_CONTRACT_CHECK(rowIndexInBlockEnd <= block.value().size());
+  }
+  AD_CONTRACT_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd);
+
+  // If we restrict `id1` and `id2`, the index block and the result must have
+  // one column (for the `id3`). Otherwise, they must have two columns (for the
+  // `id2` and the `id3`).
+  if constexpr (matchMode == MatchMode::MatchId1AndId2) {
+    AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 1);
+    AD_CONTRACT_CHECK(result.numColumns() == 1);
+  } else {
+    AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 2);
+    AD_CONTRACT_CHECK(result.numColumns() == 2);
+  }
+
+  auto resultEntry = result.begin() + offsetInResult;
+  const auto& locatedTriples = map_.at(blockIndex);
+  auto locatedTriple = locatedTriples.begin();
+
+  // Helper lambda that checks whether the given located triple should be
+  // considered, given the `matchMode`.
+  auto locatedTripleMatches = [&]() {
+    if constexpr (matchMode == MatchMode::MatchAll) {
+      return true;
+    } else if constexpr (matchMode == MatchMode::MatchId1) {
+      return locatedTriple->id1 == id1;
+    } else if constexpr (matchMode == MatchMode::MatchId1AndId2) {
+      return locatedTriple->id1 == id1 && locatedTriple->id2 == id2;
+    }
+  };
+
+  // Advance to the first located triple in the specified range.
+  while (locatedTriple != locatedTriples.end() &&
+         locatedTriple->rowIndexInBlock < rowIndexInBlockBegin) {
+    ++locatedTriple;
+  }
+
+  // Iterate over all located triples in the specified range. In the special
+  // case `block == std::nullopt` (only write located triples to `result`), all
+  // relevant located triples have `rowIndexInBlock == NO_ROW_INDEX` (here we
+  // need that `NO_ROW_INDEX` is the maximal `size_t` value minus one).
+  if (!block.has_value()) {
+    rowIndexInBlockBegin = LocatedTriple::NO_ROW_INDEX;
+    rowIndexInBlockEnd = rowIndexInBlockBegin + 1;
+    AD_CORRECTNESS_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd);
+  }
+  for (size_t rowIndex = rowIndexInBlockBegin; rowIndex < rowIndexInBlockEnd;
+       ++rowIndex) {
+    // Append triples that are marked for insertion at this `rowIndex` to the
+    // result.
+    while (locatedTriple != locatedTriples.end() &&
+           locatedTriple->rowIndexInBlock == rowIndex &&
+           locatedTriple->existsInIndex == false) {
+      if (locatedTripleMatches()) {
+        if constexpr (matchMode == MatchMode::MatchId1AndId2) {
+          (*resultEntry)[0] = locatedTriple->id3;
+        } else {
+          (*resultEntry)[0] = locatedTriple->id2;
+          (*resultEntry)[1] = locatedTriple->id3;
+        }
+        ++resultEntry;
+      }
+      ++locatedTriple;
+    }
+
+    // Append the triple at this position to the result if and only if it is not
+    // marked for deletion and matches (also skip it if it does not match).
+    bool deleteThisEntry = false;
+    if (locatedTriple != locatedTriples.end() &&
+        locatedTriple->rowIndexInBlock == rowIndex &&
+        locatedTriple->existsInIndex == true) {
+      deleteThisEntry = locatedTripleMatches();
+      ++locatedTriple;
+    }
+    if (block.has_value() && !deleteThisEntry) {
+      *resultEntry++ = block.value()[rowIndex];
+    }
+  }
+
+  // Return the number of rows written to `result`.
+  return resultEntry - (result.begin() + offsetInResult);
+}
+
+// ____________________________________________________________________________
+size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex,
+                                            std::optional<IdTable> block,
+                                            IdTable& result,
+                                            size_t offsetInResult) const {
+  return mergeTriples<MatchMode::MatchAll>(blockIndex, std::move(block), result,
+                                           offsetInResult);
+}
+
+// ____________________________________________________________________________
+size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex,
+                                            std::optional<IdTable> block,
+                                            IdTable& result,
+                                            size_t offsetInResult, Id id1,
+                                            size_t rowIndexInBlockBegin) const {
+  return mergeTriples<MatchMode::MatchId1>(
+      blockIndex, std::move(block), result, offsetInResult, id1,
+      Id::makeUndefined(), rowIndexInBlockBegin);
+}
+
+// ____________________________________________________________________________
+size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex,
+                                            std::optional<IdTable> block,
+                                            IdTable& result,
+                                            size_t offsetInResult, Id id1,
+                                            Id id2, size_t rowIndexInBlockBegin,
+                                            size_t rowIndexInBlockEnd) const {
+  return mergeTriples<MatchMode::MatchId1AndId2>(
+      blockIndex, std::move(block), result, offsetInResult, id1, id2,
+      rowIndexInBlockBegin, rowIndexInBlockEnd);
+}
+
+// ____________________________________________________________________________
+std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) {
+  os << "LT(" << lt.blockIndex << " "
+     << (lt.rowIndexInBlock == LocatedTriple::NO_ROW_INDEX
+             ? "NO_ROW_INDEX"
+             : std::to_string(lt.rowIndexInBlock))
+     << " " << lt.id1 << " " << lt.id2 << " " << lt.id3 << " "
+     << lt.existsInIndex << ")";
+  return os;
+}
+
+// ____________________________________________________________________________
+std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts) {
+  os << "{";
+  std::copy(lts.begin(), lts.end(),
+            std::ostream_iterator<LocatedTriple>(os, " "));
+  os << "}";
+  return os;
+}
+
+// ____________________________________________________________________________
+std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) {
+  for (const auto& [blockIndex, lts] : ltpb.map_) {
+    os << "Block #" << blockIndex << ": " << lts << std::endl;
+  }
+  return os;
+}
diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h
new file mode 100644
index 0000000000..bb967bfe95
--- /dev/null
+++ b/src/index/LocatedTriples.h
@@ -0,0 +1,196 @@
+// Copyright 2023, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Hannah Bast
+
+#pragma once
+
+#include "engine/idTable/IdTable.h"
+#include "global/IdTriple.h"
+#include "util/HashMap.h"
+
+class Permutation;
+
+// A triple and its location in a particular permutation.
+//
+// If a triple is not contained in the permutation, the location is the location
+// of the next larger triple (which may be in the next block or beyond the last
+// block). For a detailed definition of all border cases, see the definition at
+// the end of this file.
+//
+// NOTE: Technically, `blockIndex` and the `existsInIndex` are redundant in this
+// record because they can be derived when the class is used. However, they are
+// useful for testing, and for a small number of delta triples (think millions),
+// space efficiency is not a significant issue for this class.
+struct LocatedTriple {
+  // The index of the block and the location within that block, according to the
+  // definition above.
+  size_t blockIndex;
+  size_t rowIndexInBlock;
+  // The `Id`s of the triple in the order of the permutation. For example,
+  // for an object pertaining to the SPO permutation: `id1` is the subject,
+  // `id2` is the predicate, and `id3` is the object.
+  Id id1;
+  Id id2;
+  Id id3;
+  // Flag that is true if and only if the triple exists in the permutation. It
+  // is then equal to the triple at the position given by `blockIndex` and
+  // `rowIndexInBlock`.
+  bool existsInIndex;
+
+  // Locate the given triple in the given permutation.
+  static LocatedTriple locateTripleInPermutation(
+      Id id1, Id id2, Id id3, const Permutation& permutation);
+
+  // Special row index for triples that belong to the previous block (see the
+  // definition for the location of a triple at the end of this file).
+  //
+  // NOTE: It is important that `NO_ROW_INDEX + 1 > NO_ROW_INDEX`, hence it is
+  // defined as `max() - 1` and not as the seemingly more natural `max()`.
+  static constexpr size_t NO_ROW_INDEX = std::numeric_limits<size_t>::max() - 1;
+};
+
+// A sorted set of located triples. In `LocatedTriplesPerBlock` below, we use
+// this to store all located triples with the same `blockIndex`.
+//
+// NOTE: We could also overload `std::less` here, but the explicit specification
+// of the order makes it clearer.
+struct LocatedTripleCompare {
+  bool operator()(const LocatedTriple& x, const LocatedTriple& y) const {
+    return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3};
+  }
+};
+using LocatedTriples = std::set<LocatedTriple, LocatedTripleCompare>;
+
+// Sorted sets of located triples, grouped by block. We use this to store all
+// located triples for a permutation.
+class LocatedTriplesPerBlock {
+ private:
+  // The total number of `LocatedTriple` objects stored (for all blocks).
+  size_t numTriples_ = 0;
+
+ public:
+  // For each block with a non-empty set of located triples, the located triples
+  // in that block.
+  //
+  // NOTE: This is currently not private because we want access to
+  // `map_.size()`, `map_.clear()`, `map_.contains(...)`, and `map_.at(...)`.
+  // We could also make `LocatedTriplesPerBlock` a subclass of `HashMap<size_t,
+  // LocatedTriples>`, but not sure whether that is good style.
+  ad_utility::HashMap<size_t, LocatedTriples> map_;
+
+ public:
+  // Get the number of located triples for the given block that match `id1` (if
+  // provided) and `id2` (if provided). The return value is a pair of numbers:
+  // first, the number of new triples ("to be inserted") and second, the
+  // number of existing triples ("to be deleted").
+  std::pair<size_t, size_t> numTriples(size_t blockIndex) const;
+  std::pair<size_t, size_t> numTriples(size_t blockIndex, Id id1) const;
+  std::pair<size_t, size_t> numTriples(size_t blockIndex, Id id1, Id id2) const;
+
+  // Merge located triples for `blockIndex` with the given index `block` and
+  // write to `result`, starting from position `offsetInResult`. Consider only
+  // located triples in the range specified by `rowIndexInBlockBegin` and
+  // `rowIndexInBlockEnd`. Consider only triples that match `id1` (if provided)
+  // and `id2` (if provided). Return the number of rows written to `result`.
+  //
+  // PRECONDITIONS:
+  //
+  // 1. The set of located triples for `blockIndex` must be non-empty.
+  // Otherwise, there is no need for merging and this method shouldn't be
+  // called for efficiency reasons.
+  //
+  // 2. It is the responsibility of the caller that there is enough space for
+  // the result of the merge in `result` starting from `offsetInResult`.
+  //
+  // 3. If `block == std::nullopt`, we are adding to `result` the located
+  // triples for block `blockIndex` where the `rowIndexInBlock` is
+  // `NO_ROW_INDEX`. These actually belong to the previous block, but were
+  // larger than all triples there. This requires that `id1` or both `id1` and
+  // `id2` are specified.
+  //
+  size_t mergeTriples(size_t blockIndex, std::optional<IdTable> block,
+                      IdTable& result, size_t offsetInResult) const;
+  size_t mergeTriples(size_t blockIndex, std::optional<IdTable> block,
+                      IdTable& result, size_t offsetInResult, Id id1,
+                      size_t rowIndexInBlockBegin = 0) const;
+  size_t mergeTriples(
+      size_t blockIndex, std::optional<IdTable> block, IdTable& result,
+      size_t offsetInResult, Id id1, Id id2, size_t rowIndexInBlockBegin = 0,
+      size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const;
+
+  // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`.
+  // Return a handle to where it was added (`LocatedTriples` is a sorted set,
+  // see above). We need this handle so that we can easily remove the
+  // `locatedTriple` again from the set in case we need to.
+  //
+  // The `locatedTriple` must not already exist in `LocatedTriplesPerBlock`.
+  LocatedTriples::iterator add(const LocatedTriple& locatedTriple) {
+    LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex];
+    auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple);
+    AD_CORRECTNESS_CHECK(wasInserted == true);
+    AD_CORRECTNESS_CHECK(handle != locatedTriples.end());
+    ++numTriples_;
+    return handle;
+  }
+
+  // Get the total number of `LocatedTriple` objects (for all blocks).
+  size_t numTriples() const { return numTriples_; }
+
+  // Get the number of blocks with a non-empty set of located triples.
+  size_t numBlocks() const { return map_.size(); }
+
+  // Remove all located triples.
+  void clear() {
+    map_.clear();
+    numTriples_ = 0;
+  }
+
+ private:
+  // Match modes for `numTriplesImpl` and the `mergeTriples` template below.
+  enum struct MatchMode { MatchAll, MatchId1, MatchId1AndId2 };
+
+  // The implementation behind the public `numTriples` overloads above.
+  template <MatchMode matchMode>
+  std::pair<size_t, size_t> numTriplesImpl(size_t blockIndex,
+                                           Id id1 = Id::makeUndefined(),
+                                           Id id2 = Id::makeUndefined()) const;
+
+  // The implementation behind the public `mergeTriples` overloads above.
+  // The only reason that the arguments `id1` and `id2` come at the end here is
+  // so that we can give them default values.
+  template <MatchMode matchMode>
+  size_t mergeTriples(
+      size_t blockIndex, std::optional<IdTable> block, IdTable& result,
+      size_t offsetInResult, Id id1 = Id::makeUndefined(),
+      Id id2 = Id::makeUndefined(), size_t rowIndexInBlockBegin = 0,
+      size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const;
+};
+
+// Human-readable representation of `LocatedTriple`, `LocatedTriples`, and
+// `LocatedTriplesPerBlock`, which are very useful for debugging.
+std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt);
+std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts);
+std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb);
+
+// DEFINITION OF THE POSITION OF A LOCATED TRIPLE IN A PERMUTATION
+//
+// 1. The position is defined by the index of a block in the permutation and the
+// index of a row within that block.
+//
+// 2. If the triple is contained in the permutation, it is contained exactly
+// once and so there is a well defined block and position in that block.
+//
+// 3. If there is a block, where the first triple is smaller and the last triple
+// is larger, then that is the block and the position in that block is that of
+// the first triple that is (not smaller and hence) larger.
+//
+// 4. If the triple falls "between two blocks" (the last triple of the previous
+// block is smaller and the first triple of the next block is larger), then the
+// position is the first position in that next block.
+//
+// 5. As a special case of 4, if the triple is smaller than all triples in the
+// permutation, the position is the first position of the first block.
+//
+// 6. If the triple is larger than all triples in the permutation, the block
+// index is one after the largest block index and the position within that
+// non-existing block is arbitrary.
diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h
index da84f1158a..e24e33fe5c 100644
--- a/src/index/MetaDataHandler.h
+++ b/src/index/MetaDataHandler.h
@@ -1,29 +1,39 @@
-// Copyright 2018, University of Freiburg,
+// Copyright 2018 - 2023, University of Freiburg
 // Chair of Algorithms and Data Structures
-// Author: Johannes Kalmbach (johannes.kalmbach@gmail.com)
-//
+// Authors: Johannes Kalmbach
+//          Hannah Bast
+
 #pragma once
 
 #include
 #include
 
-#include "../global/Id.h"
-#include "../util/Exception.h"
-#include "../util/HashMap.h"
-#include "../util/Iterators.h"
-#include "../util/Log.h"
-#include "../util/Serializer/Serializer.h"
-#include "./CompressedRelation.h"
-
-// _____________________________________________________________________
+#include "global/Id.h"
+#include "index/CompressedRelation.h"
+#include "util/Exception.h"
+#include "util/HashMap.h"
+#include "util/Iterators.h"
+#include "util/Log.h"
+#include "util/Serializer/Serializer.h"
+
+// Class for access to relation metadata stored in a vector. Specifically, our
+// index uses this with `M = MmapVector<CompressedRelationMetadata>`; see
+// `index/IndexMetaData.h`
 template
 class MetaDataWrapperDense {
+ private:
+  // A vector of metadata objects.
+  M _vec;
+
  public:
+  // An iterator with an additional method `getId()` that gives the relation ID
+  // of the current metadata object.
   template
   struct AddGetIdIterator : BaseIterator {
     using BaseIterator::BaseIterator;
     AddGetIdIterator(BaseIterator base) : BaseIterator{base} {}
     [[nodiscard]] Id getId() const { return getIdFromElement(*(*this)); }
+    [[nodiscard]] const auto& getMetaData() const { return *(*this); }
     static Id getIdFromElement(const typename BaseIterator::value_type& v) {
       return v.col0Id_;
     }
@@ -39,6 +49,7 @@ class MetaDataWrapperDense {
   // The underlying array is sorted, so all iterators are ordered iterators
   using ConstOrderedIterator = ConstIterator;
 
+  // The type of the stored metadata objects.
   using value_type = typename M::value_type;
 
   // _________________________________________________________
@@ -109,12 +120,19 @@ class MetaDataWrapperDense {
   // ___________________________________________________________
   std::string getFilename() const { return _vec.getFilename(); }
 
- private:
+  // The following used to be private (because they were only used as
+  // subroutines in the above), but we now need them in
+  // `LocatedTriple::locateTripleInPermutation`.
   ConstIterator lower_bound(Id id) const {
     auto cmp = [](const auto& metaData, Id id) {
       return metaData.col0Id_ < id;
     };
     return std::lower_bound(_vec.begin(), _vec.end(), id, cmp);
   }
-  M _vec;
+  Iterator lower_bound(Id id) {
+    auto cmp = [](const auto& metaData, Id id) {
+      return metaData.col0Id_ < id;
+    };
+    return std::lower_bound(_vec.begin(), _vec.end(), id, cmp);
+  }
 };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f94b54c063..c81ecaa9c5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -104,6 +104,8 @@ addLinkAndDiscoverTest(IndexMetaDataTest index)
 # TODO fix this
 addLinkAndDiscoverTestSerial(IndexTest index)
 
+addLinkAndDiscoverTestSerial(LocatedTriplesTest index)
+
 addLinkAndDiscoverTest(FTSAlgorithmsTest index)
 
 addLinkAndDiscoverTest(EngineTest engine)
diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp
new file mode 100644
index 0000000000..ce4d0b909e
--- /dev/null
+++ b/test/LocatedTriplesTest.cpp
@@ -0,0 +1,173 @@
+// Copyright 2023, University of Freiburg,
+// Chair of Algorithms and Data Structures.
+// Author: Hannah Bast
+
+#include <gtest/gtest.h>
+
+#include "./util/IdTableHelpers.h"
+#include "./util/IdTestHelpers.h"
+#include "index/CompressedRelation.h"
+#include "index/IndexMetaData.h"
+#include "index/LocatedTriples.h"
+#include "index/Permutations.h"
+
+// TODO: Why the namespace here?
(copied from `test/IndexMetaDataTest.cpp`) +namespace { +auto V = ad_utility::testing::VocabId; +} + +// Fixture with helper functions. +class LocatedTriplesTest : public ::testing::Test { + protected: + // Make `LocatedTriplesPerBlock` from a list of `LocatedTriple` objects (the + // order in which the objects are given does not matter). + LocatedTriplesPerBlock makeLocatedTriplesPerBlock( + std::vector locatedTriples) { + LocatedTriplesPerBlock result; + for (auto locatedTriple : locatedTriples) { + result.add(locatedTriple); + } + return result; + } +}; + +// Test the method that counts the number of `LocatedTriple`s in a block. +TEST_F(LocatedTriplesTest, numTriplesInBlock) { + // Set up lists of located triples for three blocks. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(10), V(1), V(0), true}, + LocatedTriple{1, 0, V(10), V(2), V(1), true}, + LocatedTriple{1, 0, V(11), V(3), V(0), false}, + LocatedTriple{2, 0, V(20), V(4), V(0), false}, + LocatedTriple{2, 0, V(21), V(5), V(0), false}, + LocatedTriple{3, 0, V(30), V(6), V(0), false}, + LocatedTriple{3, 0, V(32), V(7), V(0), true}}); + ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + + auto P = [](size_t n1, size_t n2) -> std::pair { + return {n1, n2}; + }; + + // Check the total counts per block. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), P(1, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), P(2, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), P(1, 1)); + + // Check the counts per block for a given `id1`. 
+ ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), P(0, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), P(0, 1)); + + // Check the counts per block for a given `id1` and `id2`. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), P(0, 1)); +} + +// Test the method that merges the matching `LocatedTriple`s from a block into a +// part of an `IdTable`. +TEST_F(LocatedTriplesTest, mergeTriples) { + // A block, as it could come from an index scan. + IdTable block = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // A set of located triples for that block. 
+ auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(1), V(10), V(10), true}, // Delete row 0 + LocatedTriple{1, 1, V(1), V(10), V(11), false}, // Insert before row 1 + LocatedTriple{1, 1, V(2), V(11), V(10), false}, // Insert before row 1 + LocatedTriple{1, 4, V(2), V(21), V(11), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(10), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(20), true}, // Delete row 4 + LocatedTriple{1, 5, V(3), V(30), V(30), true}}); // Delete row 5 + + // Merge all these triples into `block` and check that the result is as + // expected (four triples inserted and three triples deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 11}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}}); // Row 6 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` into `block` (three triples + // inserted and one triple deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}, // Row 6 + {30, 30}}); // Row 7 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2)); + ASSERT_EQ(result, resultExpected); + } + + // Repeat but with a partial block that leaves out the first two elements of + // `block`. 
+ { + IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 + {20, 10}, // Row 1 + {21, 11}, // Row 2 + {30, 10}, // Row 3 + {30, 30}}); // Row 4 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2), 2); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the + // corresponding partial block (one triple inserted, one triple deleted). + { + IdTable blockColumnId3(1, ad_utility::testing::makeAllocator()); + blockColumnId3.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + blockColumnId3(i, 0) = block(i, 1); + } + IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); + IdTable result(1, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, std::move(blockColumnId3), result, 0, + V(2), V(30), 4, 6); + ASSERT_EQ(result, resultExpected); + } + + // Merge special triples. 
+ { + size_t NRI = LocatedTriple::NO_ROW_INDEX; + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{2, NRI, V(1), V(30), V(40), true}, + LocatedTriple{2, NRI, V(1), V(30), V(50), true}, + LocatedTriple{2, NRI, V(1), V(40), V(10), true}}); + IdTable resultExpected = makeIdTableFromVector({{30, 40}, // Row 0 + {30, 50}, // Row 1 + {40, 10}}); // Row 2 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(2, std::nullopt, result, 0, V(1)); + } +} From d8781a4c6f97dfe1a0607b68248339727a475ca0 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 10 Jun 2023 18:40:07 +0200 Subject: [PATCH 2/4] Add test for the `locatedTriple` method + address some of the comments from Johannes' review --- src/global/ValueId.h | 4 +- src/index/IndexMetaData.h | 3 +- src/index/LocatedTriples.cpp | 40 ++++++-- src/index/LocatedTriples.h | 3 + src/index/MetaDataHandler.h | 25 ++--- test/LocatedTriplesTest.cpp | 173 ++++++++++++++++++++++++++++++++++ test/ValueIdTest.cpp | 14 +-- test/ValuesForTestingTest.cpp | 4 +- 8 files changed, 236 insertions(+), 30 deletions(-) diff --git a/src/global/ValueId.h b/src/global/ValueId.h index b51d0aef16..26ce023ab8 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -282,10 +282,10 @@ class ValueId { /// This operator is only for debugging and testing. It returns a /// human-readable representation. 
friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) { - ostr << toString(id.getDatatype()) << ':'; + ostr << toString(id.getDatatype())[0] << ':'; auto visitor = [&ostr](T&& value) { if constexpr (ad_utility::isSimilar) { - ostr << "Undefined"; + ostr << "xx"; } else if constexpr (ad_utility::isSimilar || ad_utility::isSimilar) { ostr << std::to_string(value); diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 3039c0ba28..9842faeb69 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -226,7 +226,8 @@ template ad_utility::File& operator<<(ad_utility::File& f, const IndexMetaData& imd); -// aliases for easier use in Index class +// Aliases for easier use in classes that build or query permutations, like +// `IndexImpl`. using MetaWrapperMmap = MetaDataWrapperDense>; using MetaWrapperMmapView = MetaDataWrapperDense< diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index acd6988675..c8aef6c363 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -6,6 +6,7 @@ #include +#include "absl/strings/str_join.h" #include "index/CompressedRelation.h" #include "index/IndexMetaData.h" #include "index/Permutations.h" @@ -190,7 +191,8 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, AD_CONTRACT_CHECK(map_.contains(blockIndex)); // The special case `block == std::nullopt` (write only located triples to - // `result`) is only allowed, when `id1` or `id1` and `id2` are specified. + // `result`) is only allowed, when the `matchMode` is `MatchId1` or + // `MatchId1AndId2`, but not `MatchAll`. 
AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll); // If `rowIndexInBlockEnd` has the default value (see `LocatedTriples.h`), the @@ -324,7 +326,7 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { os << "LT(" << lt.blockIndex << " " << (lt.rowIndexInBlock == LocatedTriple::NO_ROW_INDEX - ? "NO_ROW_INDEX" + ? "x" : std::to_string(lt.rowIndexInBlock)) << " " << lt.id1 << " " << lt.id2 << " " << lt.id3 << " " << lt.existsInIndex << ")"; @@ -333,17 +335,43 @@ std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { // ____________________________________________________________________________ std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts) { - os << "{"; + os << "{ "; std::copy(lts.begin(), lts.end(), - std::ostream_iterator(std::cout, " ")); + std::ostream_iterator(os, " ")); os << "}"; return os; } // ____________________________________________________________________________ std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) { - for (auto [blockIndex, lts] : ltpb.map_) { - os << "Block #" << blockIndex << ": " << lts << std::endl; + // Get the block indices in sorted order. + std::vector blockIndices; + std::transform(ltpb.map_.begin(), ltpb.map_.end(), + std::back_inserter(blockIndices), + [](const auto& entry) { return entry.first; }); + std::ranges::sort(blockIndices); + for (auto blockIndex : blockIndices) { + os << "Block #" << blockIndex << ": " << ltpb.map_.at(blockIndex) + << std::endl; } return os; } + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, + const columnBasedIdTable::Row& idTableRow) { + os << "("; + for (size_t i = 0; i < idTableRow.numColumns(); ++i) { + os << idTableRow[i] << (i < idTableRow.numColumns() - 1 ? 
" " : ")"); + } + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const IdTable& idTable) { + os << "{ "; + std::copy(idTable.begin(), idTable.end(), + std::ostream_iterator>(os, " ")); + os << "}"; + return os; +} diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index bb967bfe95..e2a9735dc4 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -171,6 +171,9 @@ class LocatedTriplesPerBlock { std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt); std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts); std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb); +std::ostream& operator<<(std::ostream& os, + const columnBasedIdTable::Row& idTableRow); +std::ostream& operator<<(std::ostream& os, const IdTable& idTable); // DEFINITION OF THE POSITION OF A LOCATED TRIPLE IN A PERMUTATION // diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h index e24e33fe5c..437f0e3e71 100644 --- a/src/index/MetaDataHandler.h +++ b/src/index/MetaDataHandler.h @@ -16,9 +16,16 @@ #include "util/Log.h" #include "util/Serializer/Serializer.h" -// Class for access to relation metadata stored in a vector. Specifically, our -// index uses this with `M = MmapVector>`; see -// `index/IndexMetaData.h` +// Wrapper class for access to `CompressedRelationMetadata` objects (one per +// relation) stored in a vector. Specifically, our index uses this with `M = +// MmapVector>`; see `index/IndexMetaData.h` at the +// bottom. +// +// TODO: We needed this at some point because we used to have two implementation +// of `IndexMetaData`, one using mmaps and one using hash maps, and we wanted to +// have a common interface for both. We no longer use the hash map +// implementation and so the wrapper class (and the complexity that goes along +// with it) is probably no longer needed. 
template class MetaDataWrapperDense { private: @@ -99,7 +106,7 @@ class MetaDataWrapperDense { // ____________________________________________________________ void set(Id id, const value_type& value) { - // Assert that the ids are ascending. + // Check that the `Id`s are added in strictly ascending order. AD_CONTRACT_CHECK(_vec.size() == 0 || _vec.back().col0Id_ < id); _vec.push_back(value); } @@ -120,9 +127,8 @@ class MetaDataWrapperDense { // ___________________________________________________________ std::string getFilename() const { return _vec.getFilename(); } - // The following used to be private (because they were only used as - // subroutines in the above), but we now need them in - // `DeltaTriples::findTripleResult`. + // NOTE: The following used to be private (they were only used as subroutines + // in the above), but we now need them in `LocatedTriples::locateTriple`. ConstIterator lower_bound(Id id) const { auto cmp = [](const auto& metaData, Id id) { return metaData.col0Id_ < id; @@ -136,8 +142,3 @@ class MetaDataWrapperDense { return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); } }; - -// ======= -// M _vec; -// }; -// >>>>>>> master diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index ce4d0b909e..1fc681170a 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -171,3 +171,176 @@ TEST_F(LocatedTriplesTest, mergeTriples) { locatedTriplesPerBlock.mergeTriples(2, std::nullopt, result, 0, V(1)); } } + +// Test the locating of triples in a permutation using `locatedTriple`. +TEST_F(LocatedTriplesTest, locatedTriple) { + // The actual test, for a given block size. + // + // TODO: Also make the permutation an argument, right now it's only PSO. 
+ auto testWithGivenBlockSize = + [](const IdTable& triplesInIndex, const IdTable& triplesToLocate, + size_t blockSizeInBytes, + const ad_utility::HashMap& + expectedLocatedTriplesPerBlock) { + std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; + std::string permutationFilename = basename + ".index.pso"; + + // We currently assume that all triples in `triplesInIndex` have the + // same `Id` in the first column. + std::vector relationIds = {triplesInIndex(0, 0)}; + for (size_t i = 1; i < triplesInIndex.size(); ++i) { + ASSERT_EQ(triplesInIndex(i, 0), relationIds[0]); + } + + // Helper lambda for creating a `BufferedIdTable` from all triples in + // the given `IdTable` matching `relationId`. + // + // This is needed for `CompressedRelationWriter` below, which + // expects a `BufferedIdTable` with two columns. + // + // TODO: Something like this is also used in `CompressedRelationsTest`, + // so it should be in a helper class. + auto getBufferedIdTable = [](const IdTable& idTable, + Id relationId) -> BufferedIdTable { + // Note that these files are never created because we set the + // threshold for writing to disk so large. + std::string bufferFilename1 = "compressedRelationWriter.buffer1.dat"; + std::string bufferFilename2 = "compressedRelationWriter.buffer2.dat"; + AD_CONTRACT_CHECK(idTable.numColumns() == 3); + BufferedIdTable bufferedIdTable{ + 2, std::array{ + ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename1}, + ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename2}}}; + for (size_t i = 0; i < idTable.size(); ++i) { + if (idTable(i, 0) == relationId) { + bufferedIdTable.push_back({idTable(i, 1), idTable(i, 2)}); + } + } + return bufferedIdTable; + }; + + // Write the permutation to disk (adapted from + // `CompressedRelationsTest`, `IndexImpl::createPermutationPairImpl`, + // and `IndexImpl::). 
+ { + ad_utility::File permutationFileForWritingRelations{ + permutationFilename, "w"}; + IndexMetaDataMmap metadataMmap; + metadataMmap.setup(permutationFilename + MMAP_FILE_SUFFIX, + ad_utility::CreateTag{}); + CompressedRelationWriter writer{ + std::move(permutationFileForWritingRelations), blockSizeInBytes}; + for (size_t i = 0; i < relationIds.size(); ++i) { + // The third argument is the number of distinct elements. We set it + // to 1 here because it is irrelevant for the purposes of this test. + Id relationId = relationIds[i]; + auto relationMetadata = writer.addRelation( + relationId, getBufferedIdTable(triplesInIndex, relationId), 1); + metadataMmap.add(relationMetadata); + } + metadataMmap.blockData() = std::move(writer).getFinishedBlocks(); + ad_utility::File permutationFileForWritingMetadata( + permutationFilename, "r+"); + metadataMmap.appendToFile(&permutationFileForWritingMetadata); + } + + // Create a permutation based on this. + Permutation permutation{"PSO", ".pso", {1, 0, 2}}; + permutation.loadFromDisk(basename); + + // Check that the permutation indeed consists of the relations that we + // have written to it. + { + IdTable result(2, ad_utility::testing::makeAllocator()); + for (Id relationId : relationIds) { + permutation.scan(relationId, &result); + std::cout << "Relation " << relationId << ": " << result + << std::endl; + } + } + + // Now locate the triples from `triplesToLocate` in the permutation. 
+ LocatedTriplesPerBlock locatedTriplesPerBlock; + for (size_t i = 0; i < triplesToLocate.size(); ++i) { + locatedTriplesPerBlock.add(LocatedTriple::locateTripleInPermutation( + triplesToLocate(i, 0), triplesToLocate(i, 1), + triplesToLocate(i, 2), permutation)); + } + + std::cout << locatedTriplesPerBlock; + for (auto [blockIndex, locatedTriplesString] : + expectedLocatedTriplesPerBlock) { + ASSERT_TRUE(locatedTriplesPerBlock.map_.contains(blockIndex)) + << "blockIndex = " << blockIndex << " not found"; + std::ostringstream os; + os << locatedTriplesPerBlock.map_.at(blockIndex); + ASSERT_EQ(os.str(), locatedTriplesString) + << "blockIndex = " << blockIndex; + } + + // Delete the permutation files. + ad_utility::deleteFile(permutationFilename); + ad_utility::deleteFile(permutationFilename + MMAP_FILE_SUFFIX); + }; + + // Triples in the index. + IdTable triplesInIndex = makeIdTableFromVector({{1, 10, 10}, // Row 0 + {1, 15, 20}, // Row 1 + {1, 15, 30}, // Row 2 + {1, 20, 10}, // Row 3 + {1, 30, 20}, // Row 4 + {1, 30, 30}}); // Row 5 + + // Locate the following triples, some of which exist in the relation and some + // of which do not, and which cover a variety of positons, including triples + // that are larger than all existing triples. + IdTable triplesToLocate = + makeIdTableFromVector({{1, 15, 20}, // Exists. + {1, 14, 20}, // Does not exist. + {1, 20, 10}, // Exists. + {1, 30, 20}, // Exists. + {1, 30, 30}, // Exists. + {1, 30, 31}, // Larger than all existing. + {1, 30, 32}}); // Larger than all existing. + + // Now test for multiple block sizes (16 bytes is the minimum). + // testing::internal::CaptureStdout(); + std::cout << "Index triples: " << triplesInIndex << std::endl; + std::cout << "Delta triples: " << triplesToLocate << std::endl; + + // With block size 16, we have each triple in its own block. 
+ testWithGivenBlockSize( + triplesInIndex, triplesToLocate, 16, + {{1, "{ LT(1 0 V:1 V:14 V:20 0) LT(1 0 V:1 V:15 V:20 1) }"}, + {3, "{ LT(3 0 V:1 V:20 V:10 1) }"}, + {4, "{ LT(4 0 V:1 V:30 V:20 1) }"}, + {5, "{ LT(5 0 V:1 V:30 V:30 1) }"}, + {6, "{ LT(6 x V:1 V:30 V:31 0) LT(6 x V:1 V:30 V:32 0) }"}}); + + // With block size 32, we have three blocks à two triples each. + testWithGivenBlockSize( + triplesInIndex, triplesToLocate, 32, + {{0, "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) }"}, + {1, "{ LT(1 1 V:1 V:20 V:10 1) }"}, + {2, "{ LT(2 0 V:1 V:30 V:20 1) LT(2 1 V:1 V:30 V:30 1) }"}, + {3, "{ LT(3 x V:1 V:30 V:31 0) LT(3 x V:1 V:30 V:32 0) }"}}); + + // With block size 48, we have two blocks à three triples each. + testWithGivenBlockSize( + triplesInIndex, triplesToLocate, 48, + {{0, "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) }"}, + {1, + "{ LT(1 0 V:1 V:20 V:10 1) LT(1 1 V:1 V:30 V:20 1)" + " LT(1 2 V:1 V:30 V:30 1) }"}, + {2, "{ LT(2 x V:1 V:30 V:31 0) LT(2 x V:1 V:30 V:32 0) }"}}); + + // With block size 100'000, we have one block. 
+ testWithGivenBlockSize( + triplesInIndex, triplesToLocate, 100'000, + {{0, + "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) LT(0 3 V:1 V:20 " + "V:10 1) LT(0 4 V:1 V:30 V:20 1) LT(0 5 V:1 V:30 V:30 1) }"}, + {1, "{ LT(1 x V:1 V:30 V:31 0) LT(1 x V:1 V:30 V:32 0) }"}}); +} diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index dab815e207..3963e6eca5 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -278,15 +278,15 @@ TEST(ValueId, toDebugString) { stream << id; ASSERT_EQ(stream.str(), expected); }; - test(ValueId::makeUndefined(), "Undefined:Undefined"); - test(ValueId::makeFromInt(-42), "Int:-42"); - test(ValueId::makeFromDouble(42.0), "Double:42.000000"); - test(makeVocabId(15), "VocabIndex:15"); - test(makeLocalVocabId(25), "LocalVocabIndex:25"); - test(makeTextRecordId(37), "TextRecordIndex:37"); + test(ValueId::makeUndefined(), "U:xx"); + test(ValueId::makeFromInt(-42), "I:-42"); + test(ValueId::makeFromDouble(42.0), "D:42.000000"); + test(makeVocabId(15), "V:15"); + test(makeLocalVocabId(25), "L:25"); + test(makeTextRecordId(37), "T:37"); test(ValueId::makeFromDate( DateOrLargeYear{123456, DateOrLargeYear::Type::Year}), - "Date:123456"); + "D:123456"); } TEST(ValueId, InvalidDatatypeEnumValue) { diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index 44d95a3cc6..42e1ec7258 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -27,8 +27,8 @@ TEST(ValuesForTesting, valuesForTesting) { ASSERT_EQ(v.getMultiplicity(1), 84.0); ASSERT_THAT(v.asString(), - ::testing::StartsWith("Values for testing with 2 columns and " - "contents VocabIndex:3 VocabIndex:12")); + ::testing::StartsWith( + "Values for testing with 2 columns and contents V:3 V:12")); ASSERT_EQ(v.getDescriptor(), "explicit values for testing"); ASSERT_TRUE(v.resultSortedOn().empty()); ASSERT_TRUE(v.getChildren().empty()); From 258231d49d55dcfc4ea49e55c563d9c9387ad4ac Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 10 Jun 
2023 21:50:34 +0200 Subject: [PATCH 3/4] Improve the `locateTriple` test (three relations instead of just one) --- test/LocatedTriplesTest.cpp | 109 +++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 44 deletions(-) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 1fc681170a..a1d3f2136a 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -185,12 +185,13 @@ TEST_F(LocatedTriplesTest, locatedTriple) { std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; std::string permutationFilename = basename + ".index.pso"; - // We currently assume that all triples in `triplesInIndex` have the - // same `Id` in the first column. - std::vector relationIds = {triplesInIndex(0, 0)}; - for (size_t i = 1; i < triplesInIndex.size(); ++i) { - ASSERT_EQ(triplesInIndex(i, 0), relationIds[0]); + // Collect the distinct relation `Id`s. + std::vector relationIds; + for (size_t i = 0; i < triplesInIndex.numRows(); ++i) { + relationIds.push_back(triplesInIndex(i, 0)); + ASSERT_TRUE(i == 0 || relationIds[i - 1] <= relationIds[i]); } + relationIds = ad_utility::removeDuplicates(relationIds); // Helper lambda for creating a `BufferedIdTable` from all triples in // the given `IdTable` matching `relationId`. @@ -256,7 +257,7 @@ TEST_F(LocatedTriplesTest, locatedTriple) { IdTable result(2, ad_utility::testing::makeAllocator()); for (Id relationId : relationIds) { permutation.scan(relationId, &result); - std::cout << "Relation " << relationId << ": " << result + std::cout << "Relation: " << relationId << " -> " << result << std::endl; } } @@ -269,16 +270,28 @@ TEST_F(LocatedTriplesTest, locatedTriple) { triplesToLocate(i, 2), permutation)); } + // Check that the locations are as expected. Process in order of + // increasing block index because it's easier to debug. 
std::cout << locatedTriplesPerBlock; - for (auto [blockIndex, locatedTriplesString] : + std::vector blockIndices; + for (auto [blockIndex, expectedLocatedTriples] : expectedLocatedTriplesPerBlock) { + blockIndices.push_back(blockIndex); + } + std::sort(blockIndices.begin(), blockIndices.end()); + for (auto blockIndex : blockIndices) { ASSERT_TRUE(locatedTriplesPerBlock.map_.contains(blockIndex)) << "blockIndex = " << blockIndex << " not found"; std::ostringstream os; os << locatedTriplesPerBlock.map_.at(blockIndex); - ASSERT_EQ(os.str(), locatedTriplesString) + std::string computedLocatedTriples = os.str(); + std::string expectedLocatedTriples = + expectedLocatedTriplesPerBlock.at(blockIndex); + ASSERT_EQ(computedLocatedTriples, expectedLocatedTriples) << "blockIndex = " << blockIndex; } + ASSERT_EQ(locatedTriplesPerBlock.map_.size(), + expectedLocatedTriplesPerBlock.size()); // Delete the permutation files. ad_utility::deleteFile(permutationFilename); @@ -287,23 +300,25 @@ TEST_F(LocatedTriplesTest, locatedTriple) { // Triples in the index. IdTable triplesInIndex = makeIdTableFromVector({{1, 10, 10}, // Row 0 - {1, 15, 20}, // Row 1 - {1, 15, 30}, // Row 2 - {1, 20, 10}, // Row 3 - {1, 30, 20}, // Row 4 - {1, 30, 30}}); // Row 5 + {2, 10, 10}, // Row 1 + {2, 15, 20}, // Row 2 + {2, 15, 30}, // Row 3 + {2, 20, 10}, // Row 4 + {2, 30, 20}, // Row 5 + {2, 30, 30}, // Row 6 + {3, 10, 10}}); // Row 7 // Locate the following triples, some of which exist in the relation and some // of which do not, and which cover a variety of positons, including triples // that are larger than all existing triples. IdTable triplesToLocate = - makeIdTableFromVector({{1, 15, 20}, // Exists. - {1, 14, 20}, // Does not exist. - {1, 20, 10}, // Exists. - {1, 30, 20}, // Exists. - {1, 30, 30}, // Exists. - {1, 30, 31}, // Larger than all existing. - {1, 30, 32}}); // Larger than all existing. 
+ makeIdTableFromVector({{2, 15, 20}, // Equals Row 2 + {2, 14, 20}, // Before Row 2 + {2, 20, 10}, // Equals Row 4 + {2, 30, 20}, // Equals Row 5 + {2, 30, 30}, // Equals Row 6 + {2, 30, 31}, // Before Row 7 + {9, 30, 32}}); // Larger than all. // Now test for multiple block sizes (16 bytes is the minimum). // testing::internal::CaptureStdout(); @@ -313,34 +328,40 @@ TEST_F(LocatedTriplesTest, locatedTriple) { // With block size 16, we have each triple in its own block. testWithGivenBlockSize( triplesInIndex, triplesToLocate, 16, - {{1, "{ LT(1 0 V:1 V:14 V:20 0) LT(1 0 V:1 V:15 V:20 1) }"}, - {3, "{ LT(3 0 V:1 V:20 V:10 1) }"}, - {4, "{ LT(4 0 V:1 V:30 V:20 1) }"}, - {5, "{ LT(5 0 V:1 V:30 V:30 1) }"}, - {6, "{ LT(6 x V:1 V:30 V:31 0) LT(6 x V:1 V:30 V:32 0) }"}}); - - // With block size 32, we have three blocks à two triples each. + {{2, "{ LT(2 0 V:2 V:14 V:20 0) LT(2 0 V:2 V:15 V:20 1) }"}, + {4, "{ LT(4 0 V:2 V:20 V:10 1) }"}, + {5, "{ LT(5 0 V:2 V:30 V:20 1) }"}, + {6, "{ LT(6 0 V:2 V:30 V:30 1) }"}, + {7, "{ LT(7 0 V:2 V:30 V:31 0) }"}, + {8, "{ LT(8 x V:9 V:30 V:32 0) }"}}); + + // With block size 32, we have five blocks (Block 0 = Row 0, Block 1 = Row + // 1+2, Block 2 = Row 3+4, Block 3 = Row 5+6, Block 4 = Row 7). Note that a + // relation that spans multiple blocks has these blocks on its own. testWithGivenBlockSize( triplesInIndex, triplesToLocate, 32, - {{0, "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) }"}, - {1, "{ LT(1 1 V:1 V:20 V:10 1) }"}, - {2, "{ LT(2 0 V:1 V:30 V:20 1) LT(2 1 V:1 V:30 V:30 1) }"}, - {3, "{ LT(3 x V:1 V:30 V:31 0) LT(3 x V:1 V:30 V:32 0) }"}}); - - // With block size 48, we have two blocks à three triples each. 
+ {{1, "{ LT(1 1 V:2 V:14 V:20 0) LT(1 1 V:2 V:15 V:20 1) }"}, + {2, "{ LT(2 1 V:2 V:20 V:10 1) }"}, + {3, "{ LT(3 0 V:2 V:30 V:20 1) LT(3 1 V:2 V:30 V:30 1) }"}, + {4, "{ LT(4 0 V:2 V:30 V:31 0) }"}, + {5, "{ LT(5 x V:9 V:30 V:32 0) }"}}); + + // With block size 48, we have four blocks (Block 0 = Row 0, Block 1 = Row + // 1+2+3, Block 2 = Row 4+5+6, Block 3 = Row 7). testWithGivenBlockSize( triplesInIndex, triplesToLocate, 48, - {{0, "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) }"}, - {1, - "{ LT(1 0 V:1 V:20 V:10 1) LT(1 1 V:1 V:30 V:20 1)" - " LT(1 2 V:1 V:30 V:30 1) }"}, - {2, "{ LT(2 x V:1 V:30 V:31 0) LT(2 x V:1 V:30 V:32 0) }"}}); + {{1, "{ LT(1 1 V:2 V:14 V:20 0) LT(1 1 V:2 V:15 V:20 1) }"}, + {2, + "{ LT(2 0 V:2 V:20 V:10 1) LT(2 1 V:2 V:30 V:20 1)" + " LT(2 2 V:2 V:30 V:30 1) }"}, + {3, "{ LT(3 0 V:2 V:30 V:31 0) }"}, + {4, "{ LT(4 x V:9 V:30 V:32 0) }"}}); // With block size 100'000, we have one block. - testWithGivenBlockSize( - triplesInIndex, triplesToLocate, 100'000, - {{0, - "{ LT(0 1 V:1 V:14 V:20 0) LT(0 1 V:1 V:15 V:20 1) LT(0 3 V:1 V:20 " - "V:10 1) LT(0 4 V:1 V:30 V:20 1) LT(0 5 V:1 V:30 V:30 1) }"}, - {1, "{ LT(1 x V:1 V:30 V:31 0) LT(1 x V:1 V:30 V:32 0) }"}}); + testWithGivenBlockSize(triplesInIndex, triplesToLocate, 100'000, + {{0, + "{ LT(0 2 V:2 V:14 V:20 0) LT(0 2 V:2 V:15 V:20 1) " + "LT(0 4 V:2 V:20 V:10 1) LT(0 5 V:2 V:30 V:20 1) " + "LT(0 6 V:2 V:30 V:30 1) LT(0 7 V:2 V:30 V:31 0) }"}, + {1, "{ LT(1 x V:9 V:30 V:32 0) }"}}); } From 59aae8e983c1cb76c7ff66c0ad1f70715afb4e7a Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 13 Feb 2024 16:31:07 +0100 Subject: [PATCH 4/4] A few minor improvements --- src/global/IdTriple.h | 8 +------- src/index/IndexMetaData.h | 2 -- src/index/LocatedTriples.cpp | 4 ++-- src/index/LocatedTriples.h | 9 +++++---- test/LocatedTriplesTest.cpp | 3 ++- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h index 0353b8c747..425fec2d10 100644 --- 
a/src/global/IdTriple.h +++ b/src/global/IdTriple.h @@ -1,4 +1,4 @@ -// Copyright 2023, University of Freiburg +// Copyright 2024, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Hannah Bast @@ -10,9 +10,3 @@ // Should we have an own class for this? We need this at several places. using IdTriple = std::array; - -// Hash value for such triple. -template -H AbslHashValue(H h, const IdTriple& triple) { - return H::combine(std::move(h), triple[0], triple[1], triple[2]); -} diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 9842faeb69..4460cc39ab 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -87,10 +87,8 @@ class IndexMetaData { // name and the variable name are terrible. // For each relation, its meta data. - public: MapType _data; - private: // For each compressed block, its meta data. BlocksType _blockData; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index a4b05aa7db..2349b300b9 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -79,8 +79,8 @@ LocatedTriple LocatedTriple::locateTripleInPermutation( Id searchId = matchingBlock->firstTriple_.col0Id_ > id1 ? matchingBlock->firstTriple_.col0Id_ : id1; - const auto& it = meta._data.lower_bound(searchId); - AD_CORRECTNESS_CHECK(it != meta._data.end()); + const auto& it = meta.data().lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta.data().end()); Id id = it.getId(); const auto& relationMetadata = meta.getMetaData(id); size_t offsetBegin = relationMetadata.offsetInBlock_; diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index e2a9735dc4..920f6d246e 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -121,9 +121,10 @@ class LocatedTriplesPerBlock { // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. // Return a handle to where it was added (`LocatedTriples` is a sorted set, // see above). 
We need this handle so that we can easily remove the - // `locatedTriple` again from the set in case we need to. + // `locatedTriple` from the set again in case we need to. // - // The `locatedTriple` must not already exist in `LocatedTriplesPerBlock`. + // Precondition: The `locatedTriple` must not already exist in + // `LocatedTriplesPerBlock`. LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); @@ -133,7 +134,7 @@ class LocatedTriplesPerBlock { return handle; }; - // Get the total number of `LocatedTriple` objects (for all blocks). + // Get the total number of `LocatedTriple`s (for all blocks). size_t numTriples() const { return numTriples_; } // Get the number of blocks with a non-empty set of located triples. @@ -180,7 +181,7 @@ std::ostream& operator<<(std::ostream& os, const IdTable& idTable); // 1. The position is defined by the index of a block in the permutation and the // index of a row within that block. // -// 2. If the triple in contained in the permutation, it is contained exactly +// 2. If the triple is contained in the permutation, it is contained exactly // once and so there is a well defined block and position in that block. // // 2. If there is a block, where the first triple is smaller and the last triple diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 4365b60466..4753104186 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -47,6 +47,7 @@ TEST_F(LocatedTriplesTest, numTriplesInBlock) { ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + // Shorthand for creating a pair of counts. 
auto P = [](size_t n1, size_t n2) -> std::pair { return {n1, n2}; }; @@ -201,7 +202,7 @@ TEST_F(LocatedTriplesTest, locatedTriple) { IndexImpl indexBuilder(testAllocator); indexBuilder.setOnDiskBase(testIndexBasename); indexBuilder.blocksizePermutationPerColumn() = blockSize; - // The + // The function `createPermutationPair` expects a generator. IndexImpl::BlocksOfTriples blocksOfTriples = [&triplesInIndex]() -> cppcoro::generator> { co_yield triplesInIndex.clone();