ad-freiburg · joka921 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025 · Feb 5, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -203,6 +203,11 @@ if (${USE_CPP_17_BACKPORTS})
     add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0")
 endif()
 
+set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM")
+if (${VOCAB_UNCOMPRESSED_IN_MEMORY})
+    add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY")
+endif ()
+
 # Enable the specification of additional linker flags manually from the commandline
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
 set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")

diff --git a/src/engine/ExportQueryExecutionTrees.cpp b/src/engine/ExportQueryExecutionTrees.cpp
@@ -357,8 +357,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
     case Datatype::LocalVocabIndex:
       return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
     case Datatype::VocabIndex: {
-      auto entity = index.indexToString(id.getVocabIndex());
-      return LiteralOrIri::fromStringRepresentation(entity);
+      auto getEntity = [&index, id]() {
+        return index.indexToString(id.getVocabIndex());
+      };
+      // The return type of `getEntity()` might be `string_view` (if the
+      // vocabulary is stored uncompressed in RAM) or `string` (if it is
+      // on-disk, or compressed, or both). The following code works and is
+      // efficient in all cases. In particular, the `std::string` constructor
+      // is elided if `getEntity()` already returns a `string`.
+      return LiteralOrIri::fromStringRepresentation(std::string(getEntity()));
     }
     default:
       AD_FAIL();

diff --git a/src/global/Pattern.h b/src/global/Pattern.h
@@ -18,6 +18,7 @@
 #include "util/File.h"
 #include "util/Generator.h"
 #include "util/Iterators.h"
+#include "util/ResetWhenMoved.h"
 #include "util/Serializer/FileSerializer.h"
 #include "util/Serializer/SerializeVector.h"
 #include "util/TypeTraits.h"
@@ -39,8 +40,8 @@
   using ref = value_type&;
   using const_ref = const value_type&;
 
-  ref operator[](const size_t pos) { return _data[pos]; }
-  const_ref operator[](const size_t pos) const { return _data[pos]; }
+  ref operator[](const size_t pos) { return data_[pos]; }
+  const_ref operator[](const size_t pos) const { return data_[pos]; }
 
   using const_iterator = ad_utility::IteratorForAccessOperator<
       Pattern, ad_utility::AccessViaBracketOperator, ad_utility::IsConst::True>;
@@ -51,19 +52,19 @@
 
   bool operator==(const Pattern& other) const = default;
 
-  size_t size() const { return _data.size(); }
+  size_t size() const { return data_.size(); }
 
-  void push_back(value_type i) { _data.push_back(i); }
+  void push_back(value_type i) { data_.push_back(i); }
 
-  void clear() { _data.clear(); }
+  void clear() { data_.clear(); }
 
-  const_ref back() const { return _data.back(); }
-  ref back() { return _data.back(); }
-  bool empty() const { return _data.empty(); }
+  const_ref back() const { return data_.back(); }
+  ref back() { return data_.back(); }
+  bool empty() const { return data_.empty(); }
 
-  const value_type* data() const { return _data.data(); }
+  const value_type* data() const { return data_.data(); }
 
-  std::vector<value_type> _data;
+  std::vector<value_type> data_;
 };
 
 namespace detail {
@@ -113,19 +114,19 @@
     static_assert(
         ad_utility::SimilarTo<decltype(*(input.begin()->begin())), data_type>);
     // Also make room for the end offset of the last element.
-    _offsets.reserve(input.size() + 1);
+    offsets_.reserve(input.size() + 1);
     size_t dataSize = 0;
     for (const auto& element : input) {
-      _offsets.push_back(dataSize);
+      offsets_.push_back(dataSize);
       dataSize += element.size();
     }
     // The last offset is the offset right after the last element.
-    _offsets.push_back(dataSize);
+    offsets_.push_back(dataSize);
 
-    _data.reserve(dataSize);
+    data_.reserve(dataSize);
 
     for (const auto& el : input) {
-      _data.insert(_data.end(), el.begin(), el.end());
+      data_.insert(data_.end(), el.begin(), el.end());
     }
   }
 
@@ -137,9 +138,9 @@
   CompactVectorOfStrings(CompactVectorOfStrings&&) noexcept = default;
 
   // There is one more offset than the number of elements.
-  size_t size() const { return ready() ? _offsets.size() - 1 : 0; }
+  size_t size() const { return ready() ? offsets_.size() - 1 : 0; }
 
-  bool ready() const { return !_offsets.empty(); }
+  bool ready() const { return !offsets_.empty(); }
 
   /**
    * @brief operator []
@@ -148,9 +149,9 @@
    *         elements stored at the pointers target.
    */
   const value_type operator[](size_t i) const {
-    offset_type offset = _offsets[i];
-    const data_type* ptr = _data.data() + offset;
-    size_t size = _offsets[i + 1] - offset;
+    offset_type offset = offsets_[i];
+    const data_type* ptr = data_.data() + offset;
+    size_t size = offsets_[i + 1] - offset;
     return {ptr, size};
   }
 
@@ -169,80 +170,100 @@
 
   // Allow serialization via the ad_utility::serialization interface.
   AD_SERIALIZE_FRIEND_FUNCTION(CompactVectorOfStrings) {
-    serializer | arg._data;
-    serializer | arg._offsets;
+    serializer | arg.data_;
+    serializer | arg.offsets_;
   }
 
  private:
-  std::vector<data_type> _data;
-  std::vector<offset_type> _offsets;
+  std::vector<data_type> data_;
+  std::vector<offset_type> offsets_;
 };
 
 namespace detail {
 // Allows the incremental writing of a `CompactVectorOfStrings` directly to a
 // file.
 template <typename data_type>
 struct CompactStringVectorWriter {
-  ad_utility::File _file;
-  off_t _startOfFile;
+  ad_utility::File file_;
+  off_t startOfFile_;
   using offset_type = typename CompactVectorOfStrings<data_type>::offset_type;
-  std::vector<offset_type> _offsets;
-  bool _finished = false;
-  offset_type _nextOffset = 0;
+  std::vector<offset_type> offsets_;
+
+  // A `CompactStringVectorWriter` that has been moved from must no longer
+  // call `finish()` in its destructor.
+  ad_utility::ResetWhenMoved<bool, true> finished_ = false;
+  offset_type nextOffset_ = 0;
 
   explicit CompactStringVectorWriter(const std::string& filename)
-      : _file{filename, "w"} {
+      : file_{filename, "w"} {
     commonInitialization();
   }
 
   explicit CompactStringVectorWriter(ad_utility::File&& file)
-      : _file{std::move(file)} {
+      : file_{std::move(file)} {
     commonInitialization();
   }
 
   void push(const data_type* data, size_t elementSize) {
-    AD_CONTRACT_CHECK(!_finished);
-    _offsets.push_back(_nextOffset);
-    _nextOffset += elementSize;
-    _file.write(data, elementSize * sizeof(data_type));
+    AD_CONTRACT_CHECK(!finished_);
+    offsets_.push_back(nextOffset_);
+    nextOffset_ += elementSize;
+    file_.write(data, elementSize * sizeof(data_type));
   }
 
   // Finish writing, and return the moved file. If the return value is
   // discarded, then the file will be closed immediately by the destructor of
   // the `File` class.
   ad_utility::File finish() {
-    if (_finished) {
+    if (finished_) {
       return {};
     }
-    _finished = true;
-    _offsets.push_back(_nextOffset);
-    _file.seek(_startOfFile, SEEK_SET);
-    _file.write(&_nextOffset, sizeof(size_t));
-    _file.seek(0, SEEK_END);
-    ad_utility::serialization::FileWriteSerializer f{std::move(_file)};
-    f << _offsets;
+    finished_ = true;
+    offsets_.push_back(nextOffset_);
+    file_.seek(startOfFile_, SEEK_SET);
+    file_.write(&nextOffset_, sizeof(size_t));
+    file_.seek(0, SEEK_END);
+    ad_utility::serialization::FileWriteSerializer f{std::move(file_)};
+    f << offsets_;
     return std::move(f).file();
   }
 
   ~CompactStringVectorWriter() {
-    if (!_finished) {
+    if (!finished_) {
       ad_utility::terminateIfThrows(
           [this]() { finish(); },
           "Finishing the underlying File of a `CompactStringVectorWriter` "
           "during destruction failed");
     }
   }
 
+  // The copy operations would be deleted implicitly (because `File` is not
+  // copyable).
+  CompactStringVectorWriter(const CompactStringVectorWriter&) = delete;
+  CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) =
+      delete;
+
+  // The move operations have to be explicitly defaulted, because we have a
+  // manually defined destructor.
+  // Note: The defaulted move operations behave correctly because of the usage
+  // of `ResetWhenMoved` with the `finished_` member.
+  CompactStringVectorWriter(CompactStringVectorWriter&&) = default;
+  CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default;
+
  private:
   // Has to be run by all the constructors
   void commonInitialization() {
-    AD_CONTRACT_CHECK(_file.isOpen());
-    // We don't known the data size yet.
-    _startOfFile = _file.tell();
+    AD_CONTRACT_CHECK(file_.isOpen());
+    // We don't know the data size yet.
+    startOfFile_ = file_.tell();
     size_t dataSizeDummy = 0;
-    _file.write(&dataSizeDummy, sizeof(dataSizeDummy));
+    file_.write(&dataSizeDummy, sizeof(dataSizeDummy));
   }
 };
+static_assert(
+    std::is_nothrow_move_assignable_v<CompactStringVectorWriter<char>>);
+static_assert(
+    std::is_nothrow_move_constructible_v<CompactStringVectorWriter<char>>);
 }  // namespace detail
 
 // Forward iterator for a `CompactVectorOfStrings` that reads directly from
@@ -282,13 +303,11 @@
   }
 }
 
-namespace std {
 template <>
-struct hash<Pattern> {
-  std::size_t operator()(const Pattern& p) const {
+struct std::hash<Pattern> {
+  std::size_t operator()(const Pattern& p) const noexcept {
     std::string_view s = std::string_view(
-        reinterpret_cast<const char*>(p._data.data()), sizeof(Id) * p.size());
+        reinterpret_cast<const char*>(p.data_.data()), sizeof(Id) * p.size());
     return hash<std::string_view>()(s);
   }
 };
-}  // namespace std
diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(vocabulary)
 add_library(index
         Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
-        Vocabulary.cpp VocabularyOnDisk.cpp
+        Vocabulary.cpp
         LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
         DocsDB.cpp FTSAlgorithms.cpp
         PrefixHeuristic.cpp CompressedRelation.cpp

diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
@@ -99,7 +99,8 @@ constinit inline std::atomic<size_t> BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS =
 // the overhead of the metadata that has to be stored per block becomes
 // infeasible. 250K seems to be a reasonable tradeoff here.
 constexpr inline ad_utility::MemorySize
-    UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB;
+    UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN =
+        ad_utility::MemorySize::kilobytes(250);
 
 constexpr inline size_t NumColumnsIndexBuilding = 4;
 

diff --git a/src/index/Index.cpp b/src/index/Index.cpp
@@ -71,12 +71,13 @@ size_t Index::getCardinality(
 }
 
 // ____________________________________________________________________________
-std::string Index::indexToString(VocabIndex id) const {
+auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType {
   return pimpl_->indexToString(id);
 }
 
 // ____________________________________________________________________________
-std::string_view Index::indexToString(WordVocabIndex id) const {
+auto Index::indexToString(WordVocabIndex id) const
+    -> TextVocabulary::AccessReturnType {
   return pimpl_->indexToString(id);
 }
 

diff --git a/src/index/Index.h b/src/index/Index.h
@@ -104,13 +104,11 @@ class Index {
   // Read necessary metadata into memory and open file handles.
   void addTextFromOnDiskIndex();
 
-  using Vocab =
-      Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
+  using Vocab = RdfsVocabulary;
   [[nodiscard]] const Vocab& getVocab() const;
   Vocab& getNonConstVocabForTesting();
 
-  using TextVocab =
-      Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
+  using TextVocab = TextVocabulary;
   [[nodiscard]] const TextVocab& getTextVocab() const;
 
   // Get a (non-owning) pointer to the BlankNodeManager of this Index.
@@ -132,8 +130,8 @@ class Index {
 
   // TODO<joka921> Once we have an overview over the folding this logic should
   // probably not be in the index class.
-  std::string indexToString(VocabIndex id) const;
-  std::string_view indexToString(WordVocabIndex id) const;
+  Vocab::AccessReturnType indexToString(VocabIndex id) const;
+  TextVocab::AccessReturnType indexToString(WordVocabIndex id) const;
 
   [[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const;
 

diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp
@@ -11,6 +11,7 @@
 #include <string>
 
 #include "CompilationInfo.h"
+#include "IndexImpl.h"
 #include "global/Constants.h"
 #include "index/ConstantsIndexBuilding.h"
 #include "index/Index.h"
@@ -166,6 +167,7 @@ int main(int argc, char** argv) {
   bool addWordsFromLiterals = false;
   std::optional<ad_utility::MemorySize> stxxlMemory;
   std::optional<ad_utility::MemorySize> parserBufferSize;
+  std::optional<ad_utility::VocabularyType> vocabType;
   optind = 1;
 
   Index index{ad_utility::makeUnlimitedAllocator<Id>()};
@@ -224,6 +226,10 @@ int main(int argc, char** argv) {
   add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos),
       "Only build the PSO and POS permutations. This is faster, but then "
       "queries with predicate variables are not supported");
+  auto msg = absl::StrCat(
+      "The vocabulary implementation for strings in qlever, can be any of ",
+      ad_utility::VocabularyType::getListOfSupportedValues());
+  add("vocabulary-type", po::value(&vocabType), msg.c_str());
 
   // Options for the index building process.
   add("stxxl-memory,m", po::value(&stxxlMemory),
@@ -257,6 +263,10 @@ int main(int argc, char** argv) {
     index.parserBufferSize() = parserBufferSize.value();
   }
 
+  if (vocabType.has_value()) {
+    index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value());
+  }
+
   // If no text index name was specified, take the part of the wordsfile after
   // the last slash.
   if (textIndexName.empty() && !wordsfile.empty()) {

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
@@ -48,7 +48,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       if (!isLiteral(text)) {
         continue;
       }
-      WordsFileLine entityLine{text, true, contextId, 1, true};
+      WordsFileLine entityLine{std::string{text}, true, contextId, 1, true};
       co_yield entityLine;
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));