ad-freiburg · joka921 · May 22, 2024 · May 21, 2024 · May 21, 2024 · May 22, 2024
diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h
@@ -0,0 +1,12 @@
+// Copyright 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Hannah Bast <[email protected]>
+
+#pragma once
+
+#include <array>
+
+#include "global/Id.h"
+
+// Triple of `Id`s. TODO: Make this a dedicated class? Needed in several places.
+using IdTriple = std::array<Id, 3>;
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
@@ -326,10 +326,15 @@ class ValueId {
   /// This operator is only for debugging and testing. It returns a
   /// human-readable representation.
   friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) {
-    ostr << toString(id.getDatatype()) << ':';
+    ostr << toString(id.getDatatype())[0] << ':';
+    if (id.getDatatype() == Datatype::Undefined) {
+      return ostr << id.getBits();
+    }
+
     auto visitor = [&ostr]<typename T>(T&& value) {
       if constexpr (ad_utility::isSimilar<T, ValueId::UndefinedType>) {
-        ostr << "Undefined";
+        // already handled above
+        AD_FAIL();
       } else if constexpr (ad_utility::isSimilar<T, double> ||
                            ad_utility::isSimilar<T, int64_t>) {
         ostr << std::to_string(value);

diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp
@@ -679,10 +679,12 @@ IndexImpl::createPermutations(size_t numColumns, auto&& sortedTriples,
 }
 
 // ________________________________________________________________________
-size_t IndexImpl::createPermutationPair(size_t numColumns, auto&& sortedTriples,
+template <typename SortedTriplesType, typename... CallbackTypes>
+size_t IndexImpl::createPermutationPair(size_t numColumns,
+                                        SortedTriplesType&& sortedTriples,
                                         const Permutation& p1,
                                         const Permutation& p2,
-                                        auto&&... perTripleCallbacks) {
+                                        CallbackTypes&&... perTripleCallbacks) {
   auto [numDistinctC0, metaData1, metaData2] = createPermutations(
       numColumns, AD_FWD(sortedTriples), p1, p2, AD_FWD(perTripleCallbacks)...);
   // Set the name of this newly created pair of `IndexMetaData` objects.

diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
@@ -490,17 +490,20 @@ class IndexImpl {
   // OSP-OPS, SPO-SOP).  First creates the permutation and then exchanges the
   // multiplicities and also writes the MetaData to disk. So we end up with
   // fully functional permutations.
+  //
+  // TODO: The rest of this comment looks outdated.
+  //
   // performUnique must be set for the first pair created using vec to enforce
   // RDF standard (no duplicate triples).
   // createPatternsAfterFirst is only valid when  the pair is SPO-SOP because
   // the SPO permutation is also needed for patterns (see usage in
   // IndexImpl::createFromFile function)
 
-  [[nodiscard]] size_t createPermutationPair(size_t numColumns,
-                                             auto&& sortedTriples,
-                                             const Permutation& p1,
-                                             const Permutation& p2,
-                                             auto&&... perTripleCallbacks);
+  template <typename SortedTriplesType, typename... CallbackTypes>
+  [[nodiscard]] size_t createPermutationPair(
+      size_t numColumns, SortedTriplesType&& sortedTriples,
+      const Permutation& p1, const Permutation& p2,
+      CallbackTypes&&... perTripleCallbacks);
 
   // wrapper for createPermutation that saves a lot of code duplications
   // Writes the permutation that is specified by argument permutation

diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h
@@ -1,6 +1,7 @@
 // Copyright 2015, University of Freiburg,
 // Chair of Algorithms and Data Structures.
 // Author: Björn Buchhold ([email protected])
+
 #pragma once
 
 #include <stdio.h>
@@ -13,14 +14,14 @@
 #include <utility>
 #include <vector>
 
-#include "../global/Id.h"
-#include "../util/File.h"
-#include "../util/HashMap.h"
-#include "../util/MmapVector.h"
-#include "../util/ReadableNumberFact.h"
-#include "../util/Serializer/Serializer.h"
-#include "./MetaDataHandler.h"
-#include "CompressedRelation.h"
+#include "global/Id.h"
+#include "index/CompressedRelation.h"
+#include "index/MetaDataHandler.h"
+#include "util/File.h"
+#include "util/HashMap.h"
+#include "util/MmapVector.h"
+#include "util/ReadableNumberFact.h"
+#include "util/Serializer/Serializer.h"
 
 using std::array;
 using std::pair;
@@ -218,7 +219,8 @@ template <class MapType>
 ad_utility::File& operator<<(ad_utility::File& f,
                              const IndexMetaData<MapType>& imd);
 
-// aliases for easier use in Index class
+// Aliases for easier use in classes that build or query permutations, like
+// `IndexImpl`.
 using MetaWrapperMmap =
     MetaDataWrapperDense<ad_utility::MmapVector<CompressedRelationMetadata>>;
 using MetaWrapperMmapView = MetaDataWrapperDense<

diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h
@@ -1,24 +1,40 @@
-// Copyright 2018, University of Freiburg,
+// Copyright 2018 - 2023, University of Freiburg
 // Chair of Algorithms and Data Structures
-// Author: Johannes Kalmbach ([email protected])
-//
+// Authors: Johannes Kalmbach <[email protected]>
+//          Hannah Bast <[email protected]>
+
 #pragma once
 
 #include <cassert>
 #include <stxxl/vector>
 
-#include "../global/Id.h"
-#include "../util/Exception.h"
-#include "../util/HashMap.h"
-#include "../util/Iterators.h"
-#include "../util/Log.h"
-#include "../util/Serializer/Serializer.h"
-#include "./CompressedRelation.h"
-
-// _____________________________________________________________________
+#include "global/Id.h"
+#include "index/CompressedRelation.h"
+#include "util/Exception.h"
+#include "util/HashMap.h"
+#include "util/Iterators.h"
+#include "util/Log.h"
+#include "util/Serializer/Serializer.h"
+
+// Wrapper class for access to `CompressedRelationMetadata` objects (one per
+// relation) stored in a vector. Specifically, our index uses this with `M =
+// MmapVector<CompressedRelationMetadata>`; see `index/IndexMetaData.h` at the
+// bottom.
+//
+// TODO: We needed this at some point because we used to have two
+// implementations of `IndexMetaData`, one using mmaps and one using hash maps,
+// and we wanted to have a common interface for both. We no longer use the hash
+// map implementation and so the wrapper class (and the complexity that goes
+// along with it) is probably no longer needed.
 template <class M>
 class MetaDataWrapperDense {
+ private:
+  // A vector of metadata objects.
+  M _vec;
+
  public:
+  // An iterator with an additional method `getId()` that gives the relation ID
+  // of the current metadata object.
   template <typename BaseIterator>
   struct AddGetIdIterator : BaseIterator {
     using BaseIterator::BaseIterator;
@@ -39,6 +55,7 @@ class MetaDataWrapperDense {
   // The underlying array is sorted, so all iterators are ordered iterators
   using ConstOrderedIterator = ConstIterator;
 
+  // The type of the stored metadata objects.
   using value_type = typename M::value_type;
 
   // _________________________________________________________
@@ -88,7 +105,7 @@ class MetaDataWrapperDense {
 
   // ____________________________________________________________
   void set(Id id, const value_type& value) {
-    // Assert that the ids are ascending.
+    // `id` must be strictly greater than the `col0Id_` of the last stored entry.
     AD_CONTRACT_CHECK(_vec.size() == 0 || _vec.back().col0Id_ < id);
     _vec.push_back(value);
   }
@@ -116,5 +133,10 @@ class MetaDataWrapperDense {
     };
     return std::lower_bound(_vec.begin(), _vec.end(), id, cmp);
   }
-  M _vec;
+  Iterator lower_bound(Id id) {
+    auto isBefore = [](const auto& entry, Id key) {
+      return entry.col0Id_ < key;
+    };
+    return std::lower_bound(_vec.begin(), _vec.end(), id, isBefore);
+  }
 };
diff --git a/test/CompressedRelationsTest.cpp b/test/CompressedRelationsTest.cpp
@@ -507,5 +507,5 @@ TEST(CompressedRelationReader, PermutedTripleToString) {
   auto tr = CompressedBlockMetadata::PermutedTriple{V(12), V(13), V(27)};
   std::stringstream str;
   str << tr;
-  ASSERT_EQ(str.str(), "Triple: VocabIndex:12 VocabIndex:13 VocabIndex:27\n");
+  ASSERT_EQ(str.str(), "Triple: V:12 V:13 V:27\n");
 }
diff --git a/test/SparqlExpressionTypesTest.cpp b/test/SparqlExpressionTypesTest.cpp
@@ -34,7 +34,7 @@ TEST(SparqlExpressionTypes, printIdOrString) {
 
   IdOrLiteralOrIri idOrString{Id::makeUndefined()};
   PrintTo(idOrString, &str);
-  ASSERT_EQ(str.str(), "Undefined:Undefined");
+  ASSERT_EQ(str.str(), "U:0");
   idOrString = LiteralOrIri::literalWithoutQuotes("bimm");
   // Clear the stringstream.
   str.str({});

diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp
@@ -289,21 +289,30 @@ TEST(ValueId, toDebugString) {
     stream << id;
     ASSERT_EQ(stream.str(), expected);
   };
-  test(ValueId::makeUndefined(), "Undefined:Undefined");
-  test(ValueId::makeFromInt(-42), "Int:-42");
-  test(ValueId::makeFromDouble(42.0), "Double:42.000000");
-  test(ValueId::makeFromBool(false), "Bool:false");
-  test(ValueId::makeFromBool(true), "Bool:true");
-  test(makeVocabId(15), "VocabIndex:15");
+  test(ValueId::makeUndefined(), "U:0");
+  // Values with type undefined can usually only have one value (all data bits
+  // zero). Sometimes ValueIds with type undefined but non-zero data bits are
+  // used. The following test tests one of these internal ValueIds.
+  ValueId customUndefined = ValueId::fromBits(
+      ValueId::IntegerType::fromNBit(100) |
+      (static_cast<ValueId::T>(Datatype::Undefined) << ValueId::numDataBits));
+  test(customUndefined, "U:100");
+  test(ValueId::makeFromInt(-42), "I:-42");
+  // Only the first letter of the datatype name is printed, so `Double`/`Date`
+  // both yield `D:` and `Bool`/`BlankNodeIndex` both yield `B:`.
+  test(ValueId::makeFromDouble(42.0), "D:42.000000");
+  test(ValueId::makeFromBool(false), "B:false");
+  test(ValueId::makeFromBool(true), "B:true");
+  test(makeVocabId(15), "V:15");
   auto str = ad_utility::triple_component::LiteralOrIri::literalWithoutQuotes(
       "SomeValue");
-  test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:\"SomeValue\"");
-  test(makeTextRecordId(37), "TextRecordIndex:37");
-  test(makeWordVocabId(42), "WordVocabIndex:42");
-  test(makeBlankNodeId(27), "BlankNodeIndex:27");
+  test(ValueId::makeFromLocalVocabIndex(&str), "L:\"SomeValue\"");
+  test(makeTextRecordId(37), "T:37");
+  test(makeWordVocabId(42), "W:42");
+  test(makeBlankNodeId(27), "B:27");
   test(ValueId::makeFromDate(
            DateOrLargeYear{123456, DateOrLargeYear::Type::Year}),
-       "Date:123456");
+       "D:123456");
   // make an ID with an invalid datatype
   ASSERT_ANY_THROW(test(ValueId::max(), "blim"));
 }

diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp
@@ -27,8 +27,7 @@ TEST(ValuesForTesting, valuesForTesting) {
 
   ASSERT_THAT(
       v.getCacheKey(),
-      ::testing::StartsWith(
-          "Values for testing with 2 columns. VocabIndex:3 VocabIndex:12"));
+      ::testing::StartsWith("Values for testing with 2 columns. V:3 V:12"));
   ASSERT_THAT(v.getCacheKey(), ::testing::EndsWith("Supports limit: 0"));
   ASSERT_EQ(v.getDescriptor(), "explicit values for testing");
   ASSERT_TRUE(v.resultSortedOn().empty());