ad-freiburg · hannahbast · Jun 9, 2023 · Jun 10, 2023 · Jun 10, 2023 · Feb 10, 2024
diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h
@@ -0,0 +1,12 @@
+// Copyright 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Hannah Bast <[email protected]>
+
+#pragma once
+
+#include <array>
+
+#include "global/Id.h"
+
+// Triple of `Id`s, used in several places. TODO: Make it a class of its own?
+using IdTriple = std::array<Id, 3>;
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
@@ -311,10 +311,10 @@ class ValueId {
   /// This operator is only for debugging and testing. It returns a
   /// human-readable representation.
   friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) {
-    ostr << toString(id.getDatatype()) << ':';
+    ostr << toString(id.getDatatype())[0] << ':';
     auto visitor = [&ostr]<typename T>(T&& value) {
       if constexpr (ad_utility::isSimilar<T, ValueId::UndefinedType>) {
-        ostr << "Undefined";
+        ostr << "xx";
       } else if constexpr (ad_utility::isSimilar<T, double> ||
                            ad_utility::isSimilar<T, int64_t>) {
         ostr << std::to_string(value);

diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt
@@ -2,7 +2,7 @@ add_subdirectory(vocabulary)
 add_library(index
         Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
         Vocabulary.cpp VocabularyOnDisk.cpp
-        Permutation.cpp TextMetaData.cpp
+        LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
         DocsDB.cpp FTSAlgorithms.cpp
         PrefixHeuristic.cpp CompressedRelation.cpp
         PatternCreator.cpp)

diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp
@@ -1073,7 +1073,9 @@ CompressedRelationWriter::createPermutationPair(
   }
   inputWaitTimer.cont();
   for (auto& block : AD_FWD(sortedTriples)) {
-    AD_CORRECTNESS_CHECK(block.numColumns() == numColumns + 1);
+    AD_CORRECTNESS_CHECK(block.numColumns() == numColumns + 1,
+                         "block.numColumns() = ", block.numColumns(),
+                         ", numColumns = ", numColumns);
     inputWaitTimer.stop();
     // This only happens when the index is completely empty.
     if (block.empty()) {

diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h
@@ -510,6 +510,7 @@ class CompressedRelationReader {
   static void decompressColumn(const std::vector<char>& compressedColumn,
                                size_t numRowsToRead, Iterator iterator);
 
+ public:
   // Read the block that is identified by the `blockMetaData` from the `file`,
   // decompress and return it. Only the columns specified by the `columnIndices`
   // are returned.

diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp
@@ -740,10 +740,12 @@ IndexImpl::createPermutations(size_t numColumns, auto&& sortedTriples,
 }
 
 // ________________________________________________________________________
-void IndexImpl::createPermutationPair(size_t numColumns, auto&& sortedTriples,
+template <typename SortedTriplesType, typename... CallbackTypes>
+void IndexImpl::createPermutationPair(size_t numColumns,
+                                      SortedTriplesType&& sortedTriples,
                                       const Permutation& p1,
                                       const Permutation& p2,
-                                      auto&&... perTripleCallbacks) {
+                                      CallbackTypes&&... perTripleCallbacks) {
   auto [metaData1, metaData2] = createPermutations(
       numColumns, AD_FWD(sortedTriples), p1, p2, AD_FWD(perTripleCallbacks)...);
   // Set the name of this newly created pair of `IndexMetaData` objects.
@@ -762,6 +764,15 @@ void IndexImpl::createPermutationPair(size_t numColumns, auto&& sortedTriples,
   writeMetadata(metaData2, p2);
 }
 
+// Explicit instantiation (with no per-triple callbacks) of the template that
+// is defined in this file; it is needed by `test/LocatedTripleTest.cpp`.
+// TODO: Does `SortedTriplesType` really need to be a template parameter (or
+// `auto&&` as before)? It looks like this function (and others of its kind)
+// is always called with `sortedTriples` of type `BlocksOfTriples`.
+template void IndexImpl::createPermutationPair<IndexImpl::BlocksOfTriples>(
+    size_t, IndexImpl::BlocksOfTriples&&, const Permutation&,
+    const Permutation&);
+
 // _____________________________________________________________________________
 void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
   setOnDiskBase(onDiskBase);

diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
@@ -492,30 +492,38 @@ class IndexImpl {
                             std::array<size_t, 3> permutation,
                             auto&&... perTripleCallbacks);
 
-  // _______________________________________________________________________
   // Create a pair of permutations. Only works for valid pairs (PSO-POS,
   // OSP-OPS, SPO-SOP).  First creates the permutation and then exchanges the
   // multiplicities and also writes the MetaData to disk. So we end up with
   // fully functional permutations.
+  //
+  // TODO: The rest of this comment looks outdated.
+  //
   // performUnique must be set for the first pair created using vec to enforce
   // RDF standard (no duplicate triples).
   // createPatternsAfterFirst is only valid when  the pair is SPO-SOP because
   // the SPO permutation is also needed for patterns (see usage in
   // IndexImpl::createFromFile function)
-
-  void createPermutationPair(size_t numColumns, auto&& sortedTriples,
+ public:
+  template <typename SortedTriplesType, typename... CallbackTypes>
+  void createPermutationPair(size_t numColumns,
+                             SortedTriplesType&& sortedTriples,
                              const Permutation& p1, const Permutation& p2,
-                             auto&&... perTripleCallbacks);
+                             CallbackTypes&&... perTripleCallbacks);
+  // void createPermutationPair(size_t numColumns, auto&& sortedTriples,
+  //                            const Permutation& p1, const Permutation& p2,
+  //                            auto&&... perTripleCallbacks);
 
+ private:
   // wrapper for createPermutation that saves a lot of code duplications
   // Writes the permutation that is specified by argument permutation
   // performs std::unique on arg vec iff arg performUnique is true (normally
   // done for first permutation that is created using vec).
   // Will sort vec.
   // returns the MetaData (MmapBased or HmapBased) for this relation.
-  // Careful: only multiplicities for first column is valid after call, need to
-  // call exchangeMultiplicities as done by createPermutationPair
-  // the optional is std::nullopt if vec and thus the index is empty
+  // Careful: only multiplicities for first column are valid after call, need
+  // to call exchangeMultiplicities as done by createPermutationPair. The
+  // optional is std::nullopt if vec and thus the index is empty.
   std::pair<IndexMetaDataMmapDispatcher::WriteType,
             IndexMetaDataMmapDispatcher::WriteType>
   createPermutations(size_t numColumns, auto&& sortedTriples,
@@ -545,12 +553,13 @@ class IndexImpl {
       MakeFromUint64t makeFromUint = MakeFromUint64t{}) const;
 
   // Get the metadata for the block from the text index that contains the
-  // `word`. Also works for prefixes that are terminated with `PREFIX_CHAR` like
-  // "astro*". Returns `nullopt` if no suitable block was found because no
-  // matching word is contained in the text index. Some additional information
-  // is also returned that is often required by the calling functions:
-  // `hasToBeFiltered_` is true iff `word` is NOT the only word in the text
-  // block, and additional filtering is thus required. `idRange_` is the range
+  // `word`. Also works for prefixes that are terminated with `PREFIX_CHAR`
+  // like "astro*". Returns `nullopt` if no suitable block was found because
+  // no matching word is contained in the text index. Some additional
+  // information is also returned that is often required by the calling
+  // functions:
+  // `hasToBeFiltered_` is true iff `word` is NOT the only word in the text
+  // block, and additional filtering is thus required. `idRange_` is the range
   // `[first, last]` of the `WordVocabIndex`es that correspond to the word
   // (which might also be a prefix, thus it is a range).
   struct TextBlockMetadataAndWordInfo {
@@ -612,8 +621,8 @@ class IndexImpl {
 
  private:
   /**
-   * @brief Throws an exception if no patterns are loaded. Should be called from
-   *        whithin any index method that returns data requiring the patterns
+   * @brief Throws an exception if no patterns are loaded. Should be called
+   * from within any index method that returns data requiring the patterns
    *        file.
    */
   void throwExceptionIfNoPatterns() const;
@@ -632,25 +641,26 @@ class IndexImpl {
 
  public:
   // Count the number of "QLever-internal" triples (predicate ql:langtag or
-  // predicate starts with @) and all other triples (that were actually part of
-  // the input).
+  // predicate starts with @) and all other triples (that were actually part
+  // of the input).
   NumNormalAndInternal numTriples() const;
 
   // The index contains several triples that are not part of the "actual"
-  // knowledge graph, but are added by QLever for internal reasons (e.g. for an
-  // efficient implementation of language filters). For a given
+  // knowledge graph, but are added by QLever for internal reasons (e.g. for
+  // an efficient implementation of language filters). For a given
   // `Permutation::Enum`, returns the following `std::pair`:
   //
   // first:  A `vector<pair<Id, Id>>` that denotes ranges in the first column
   //         of the permutation that imply that a triple is added. For example
-  //         in the `SPO` and `SOP` permutation a literal subject means that the
-  //         triple was added (literals are not legal subjects in RDF), so the
-  //         pair `(idOfFirstLiteral, idOfLastLiteral + 1)` will be contained
-  //         in the vector.
+  //         in the `SPO` and `SOP` permutation a literal subject means that
+  //         the triple was added (literals are not legal subjects in RDF), so
+  //         the pair `(idOfFirstLiteral, idOfLastLiteral + 1)` will be
+  //         contained in the vector.
   // second: A lambda that checks for a triple *that is not already excluded
   //         by the ignored ranges from the first argument* whether it still
-  //         is an added triple. For example in the `Sxx` and `Oxx` permutation
-  //         a triple where the predicate starts with '@' (instead of the usual
+  //         is an added triple. For example in the `Sxx` and `Oxx`
+  //         permutation a triple where the predicate starts with '@' (instead
+  //         of the usual
   //         '<' is an added triple from the language filter implementation.
   //
   // Note: A triple from a given permutation is an added triple if and only if
@@ -698,13 +708,13 @@ class IndexImpl {
     auto isTripleIgnored = [permutation,
                             isInternalPredicateId](const auto& triple) {
       // TODO<joka921, everybody in the future>:
-      // A lot of code (especially for statistical queries in `GroupBy.cpp` and
-      // the pattern trick) relies on this function being a noop for the `PSO`
-      // and `POS` permutations, meaning that it suffices to check the
-      // `ignoredRanges` for them. Should this ever change (which means that we
-      // add internal triples that use predicates that are actually contained in
-      // the knowledge graph), then all the code that uses this function has to
-      // be thoroughly reviewed.
+      // A lot of code (especially for statistical queries in `GroupBy.cpp`
+      // and the pattern trick) relies on this function being a noop for the
+      // `PSO` and `POS` permutations, meaning that it suffices to check the
+      // `ignoredRanges` for them. Should this ever change (which means that
+      // we add internal triples that use predicates that are actually
+      // contained in the knowledge graph), then all the code that uses this
+      // function has to be thoroughly reviewed.
       if (permutation == SPO || permutation == OPS) {
         // Predicates are always entities from the vocabulary.
         return isInternalPredicateId(triple[1]);
@@ -718,12 +728,13 @@ class IndexImpl {
   }
   using BlocksOfTriples = cppcoro::generator<IdTableStatic<0>>;
 
-  // Functions to create the pairs of permutations during the index build. Each
-  // of them takes the following arguments:
+  // Functions to create the pairs of permutations during the index build.
+  // Each of them takes the following arguments:
   // * `isQleverInternalId` a callable that takes an `Id` and returns true iff
-  //    the corresponding IRI was internally added by QLever and not part of the
-  //    knowledge graph.
-  // * `sortedInput`  The input, must be sorted by the first permutation in the
+  //    the corresponding IRI was internally added by QLever and not part of
+  //    the knowledge graph.
+  // * `sortedInput`  The input, must be sorted by the first
+  //    permutation in the
   //    function name.
   // * `nextSorter` A callback that is invoked for each row in each of the
   //    blocks in the input. Typically used to set up the sorting for the
@@ -746,17 +757,17 @@ class IndexImpl {
                        NextSorter&&... nextSorter);
 
   // Create the PSO and POS permutations. Additionally, count the number of
-  // distinct predicates and the number of actual triples and write them to the
-  // metadata.
+  // distinct predicates and the number of actual triples and write them to
+  // the metadata.
   template <typename... NextSorter>
   requires(sizeof...(NextSorter) <= 1)
   void createPSOAndPOS(size_t numColumns, auto& isInternalId,
                        BlocksOfTriples sortedTriples,
                        NextSorter&&... nextSorter);
 
   // Set up one of the permutation sorters with the appropriate memory limit.
-  // The `permutationName` is used to determine the filename and must be unique
-  // for each call during one index build.
+  // The `permutationName` is used to determine the filename and must be
+  // unique for each call during one index build.
   template <typename Comparator, size_t N = NumColumnsIndexBuilding>
   ExternalSorter<Comparator, N> makeSorter(
       std::string_view permutationName) const;
@@ -773,10 +784,10 @@ class IndexImpl {
   // function names are consistent with the aliases for the sorters, i.e. that
   // `createFirstPermutationPair` corresponds to the `FirstPermutation`.
 
-  // The `createFirstPermutationPair` has a special implementation for the case
-  // of only two permutations (where we have to build the Pxx permutations). In
-  // all other cases the Sxx permutations are built first because we need the
-  // patterns.
+  // The `createFirstPermutationPair` has a special implementation for the
+  // case of only two permutations (where we have to build the Pxx
+  // permutations). In all other cases the Sxx permutations are built first
+  // because we need the patterns.
   std::optional<PatternCreator::TripleSorter> createFirstPermutationPair(
       auto&&... args) {
     static_assert(std::is_same_v<FirstPermutation, SortBySPO>);
@@ -799,12 +810,12 @@ class IndexImpl {
     return createPSOAndPOS(AD_FWD(args)...);
   }
 
-  // Build the OSP and OPS permutations from the output of the `PatternCreator`.
-  // The permutations will have two additional columns: The subject pattern of
-  // the subject (which is already created by the `PatternCreator`) and the
-  // subject pattern of the object (which is created by this function). Return
-  // these five columns sorted by PSO, to be used as an input for building the
-  // PSO and POS permutations.
+  // Build the OSP and OPS permutations from the output of the
+  // `PatternCreator`. The permutations will have two additional columns: The
+  // subject pattern of the subject (which is already created by the
+  // `PatternCreator`) and the subject pattern of the object (which is created
+  // by this function). Return these five columns sorted by PSO, to be used as
+  // an input for building the PSO and POS permutations.
   std::unique_ptr<ExternalSorter<SortByPSO, 5>> buildOspWithPatterns(
       PatternCreator::TripleSorter sortersFromPatternCreator,
       auto isQLeverInternalId);

diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h
@@ -1,6 +1,7 @@
 // Copyright 2015, University of Freiburg,
 // Chair of Algorithms and Data Structures.
 // Author: Björn Buchhold ([email protected])
+
 #pragma once
 
 #include <stdio.h>
@@ -13,14 +14,14 @@
 #include <utility>
 #include <vector>
 
-#include "../global/Id.h"
-#include "../util/File.h"
-#include "../util/HashMap.h"
-#include "../util/MmapVector.h"
-#include "../util/ReadableNumberFact.h"
-#include "../util/Serializer/Serializer.h"
-#include "./MetaDataHandler.h"
-#include "CompressedRelation.h"
+#include "global/Id.h"
+#include "index/CompressedRelation.h"
+#include "index/MetaDataHandler.h"
+#include "util/File.h"
+#include "util/HashMap.h"
+#include "util/MmapVector.h"
+#include "util/ReadableNumberFact.h"
+#include "util/Serializer/Serializer.h"
 
 using std::array;
 using std::pair;
@@ -87,6 +88,7 @@ class IndexMetaData {
 
   // For each relation, its meta data.
   MapType _data;
+
   // For each compressed block, its meta data.
   BlocksType _blockData;
 
@@ -222,7 +224,8 @@ template <class MapType>
 ad_utility::File& operator<<(ad_utility::File& f,
                              const IndexMetaData<MapType>& imd);
 
-// aliases for easier use in Index class
+// Aliases for easier use in classes that build or query permutations, like
+// `IndexImpl`.
 using MetaWrapperMmap =
     MetaDataWrapperDense<ad_utility::MmapVector<CompressedRelationMetadata>>;
 using MetaWrapperMmapView = MetaDataWrapperDense<