From e9e8dfd18978a96bf5d61926e828200b18d4a2ff Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 09:03:12 +0100 Subject: [PATCH 01/13] Make the In-Memory-Vocabulary compatible with the RDFVocabulary Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.Text.cpp | 2 +- src/index/IndexImpl.cpp | 7 +++++-- src/index/IndexImpl.h | 2 +- src/index/StringSortComparator.h | 7 +++++++ src/index/Vocabulary.cpp | 2 +- src/index/Vocabulary.h | 7 ++++++- src/index/vocabulary/VocabularyInMemory.h | 5 ++++- 7 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 3b872eb39c..2f15be7e5c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -48,7 +48,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - WordsFileLine entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{std::string{text}, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 72efec5307..6205d08f6b 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -974,7 +974,7 @@ size_t IndexImpl::getNumDistinctSubjectPredicatePairs() const { } // _____________________________________________________________________________ -bool IndexImpl::isLiteral(const string& object) const { +bool IndexImpl::isLiteral(std::string_view object) const { return decltype(vocab_)::stringIsLiteral(object); } @@ -1522,7 +1522,10 @@ size_t IndexImpl::getCardinality( } // ___________________________________________________________________________ -std::string IndexImpl::indexToString(VocabIndex id) const { return vocab_[id]; } +// TODO Make this the return type of the vocabulary. 
+std::string IndexImpl::indexToString(VocabIndex id) const { + return std::string{vocab_[id]}; +} // ___________________________________________________________________________ std::string_view IndexImpl::indexToString(WordVocabIndex id) const { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d284cdb415..a698a96c6f 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -634,7 +634,7 @@ class IndexImpl { friend class IndexTest_createFromOnDiskIndexTest_Test; friend class CreatePatternsFixture_createPatterns_Test; - bool isLiteral(const string& object) const; + bool isLiteral(std::string_view object) const; public: LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const; diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h index 81829f226e..33f1f2077c 100644 --- a/src/index/StringSortComparator.h +++ b/src/index/StringSortComparator.h @@ -619,6 +619,13 @@ class TripleComponentComparator { return compare(spA, spB, level) < 0; } + // TODO Unify these three functions. 
+ bool operator()(const SplitVal& spA, std::string_view b, + const Level level) const { + auto spB = extractAndTransformComparable(b, level, false); + return compare(spA, spB, level) < 0; + } + template bool operator()(const SplitValBase& a, const SplitValBase& b, const Level level) const { diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index ab2cb52505..cd3b25b490 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -43,7 +43,7 @@ void Vocabulary::readFromFile(const string& fileName) { << std::endl; vocabulary_.close(); vocabulary_.open(fileName); - if constexpr (isCompressed_) { + if constexpr (isCompressed_ && false) { const auto& internalExternalVocab = vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary(); LOG(INFO) << "Done, number of words: " diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 6775a13217..fc9c118b87 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -37,9 +37,11 @@ using std::string; using std::vector; template -using AccessReturnType_t = +using AccessReturnType_t = std::string_view; +/* std::conditional_t, std::string, std::string_view>; + */ template class IdRange { @@ -114,10 +116,13 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; + using UnderlyingVocabulary = VocabularyInMemory; + /* using UnderlyingVocabulary = std::conditional_t, VocabularyInMemory>; + */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index 5ce18fe721..efe9a9c7e7 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -68,11 +68,14 @@ class VocabularyInMemory struct WordWriter { typename Words::Writer writer_; explicit WordWriter(const std::string& filename) : writer_{filename} {} - void operator()(std::string_view str) { + void operator()(std::string_view str, + [[maybe_unused]] bool 
isExternalDummy = false) { writer_.push(str.data(), str.size()); } void finish() { writer_.finish(); } + std::string readableNameDummy_; + std::string& readableName() { return readableNameDummy_; } }; // Return a `WordWriter` that directly writes the words to the given From 79a11b662ad4a0e93db03b5d32512a689afef90d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 11:22:20 +0100 Subject: [PATCH 02/13] Refactor things. TODO: Make the vocabulary implementation be choosable from CMake Signed-off-by: Johannes Kalmbach --- src/engine/ExportQueryExecutionTrees.cpp | 11 +++- src/index/Index.cpp | 5 +- src/index/Index.h | 10 ++-- src/index/IndexImpl.cpp | 8 +-- src/index/IndexImpl.h | 4 +- src/index/StringSortComparator.h | 1 - src/index/Vocabulary.cpp | 42 ++++---------- src/index/Vocabulary.h | 57 ++++++++----------- src/index/vocabulary/VocabularyInMemory.cpp | 3 + src/index/vocabulary/VocabularyInMemory.h | 6 ++ .../vocabulary/VocabularyInternalExternal.cpp | 12 ++++ .../vocabulary/VocabularyInternalExternal.h | 5 +- test/engine/TextIndexScanTestHelpers.h | 8 +-- 13 files changed, 85 insertions(+), 87 deletions(-) diff --git a/src/engine/ExportQueryExecutionTrees.cpp b/src/engine/ExportQueryExecutionTrees.cpp index 3375e82924..351dc9b28d 100644 --- a/src/engine/ExportQueryExecutionTrees.cpp +++ b/src/engine/ExportQueryExecutionTrees.cpp @@ -356,8 +356,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex( case Datatype::LocalVocabIndex: return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri(); case Datatype::VocabIndex: { - auto entity = index.indexToString(id.getVocabIndex()); - return LiteralOrIri::fromStringRepresentation(entity); + auto getEntity = [&index, id]() { + return index.indexToString(id.getVocabIndex()); + }; + // The type of entity might be `string_view` (If the vocabulary is stored + // uncompressed in RAM) or `string` (if it is on-disk, or compressed or + // both). 
The following code works and is efficient in all cases. In + // particular, the `std::string` constructor is compiled out because of + // RVO if `getEntity()` already returns a `string`. + return LiteralOrIri::fromStringRepresentation(std::string(getEntity())); } default: AD_FAIL(); diff --git a/src/index/Index.cpp b/src/index/Index.cpp index f66914bfca..06350e1e26 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -71,12 +71,13 @@ size_t Index::getCardinality( } // ____________________________________________________________________________ -std::string Index::indexToString(VocabIndex id) const { +auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType { return pimpl_->indexToString(id); } // ____________________________________________________________________________ -std::string_view Index::indexToString(WordVocabIndex id) const { +auto Index::indexToString(WordVocabIndex id) const + -> TextVocabulary::AccessReturnType { return pimpl_->indexToString(id); } diff --git a/src/index/Index.h b/src/index/Index.h index 8c6dd1cd40..101908ab7e 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -104,13 +104,11 @@ class Index { // Read necessary metadata into memory and open file handles. void addTextFromOnDiskIndex(); - using Vocab = - Vocabulary; + using Vocab = RdfsVocabulary; [[nodiscard]] const Vocab& getVocab() const; Vocab& getNonConstVocabForTesting(); - using TextVocab = - Vocabulary; + using TextVocab = TextVocabulary; [[nodiscard]] const TextVocab& getTextVocab() const; // Get a (non-owning) pointer to the BlankNodeManager of this Index. @@ -132,8 +130,8 @@ class Index { // TODO Once we have an overview over the folding this logic should // probably not be in the index class. 
- std::string indexToString(VocabIndex id) const; - std::string_view indexToString(WordVocabIndex id) const; + Vocab::AccessReturnType indexToString(VocabIndex id) const; + TextVocab::AccessReturnType indexToString(WordVocabIndex id) const; [[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 6205d08f6b..d5781bb297 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1522,13 +1522,13 @@ size_t IndexImpl::getCardinality( } // ___________________________________________________________________________ -// TODO Make this the return type of the vocabulary. -std::string IndexImpl::indexToString(VocabIndex id) const { - return std::string{vocab_[id]}; +RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const { + return vocab_[id]; } // ___________________________________________________________________________ -std::string_view IndexImpl::indexToString(WordVocabIndex id) const { +TextVocabulary::AccessReturnType IndexImpl::indexToString( + WordVocabIndex id) const { return textVocab_[id]; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a698a96c6f..8478943c92 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -305,10 +305,10 @@ class IndexImpl { const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________________ - std::string indexToString(VocabIndex id) const; + RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const; // ___________________________________________________________________________ - std::string_view indexToString(WordVocabIndex id) const; + TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const; public: // ___________________________________________________________________________ diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h index 33f1f2077c..d77e616a76 
100644 --- a/src/index/StringSortComparator.h +++ b/src/index/StringSortComparator.h @@ -619,7 +619,6 @@ class TripleComponentComparator { return compare(spA, spB, level) < 0; } - // TODO Unify these three functions. bool operator()(const SplitVal& spA, std::string_view b, const Level level) const { auto spB = extractAndTransformComparable(b, level, false); diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index cd3b25b490..70e9f0c50e 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -39,20 +39,8 @@ bool Vocabulary::PrefixRanges::contain( // _____________________________________________________________________________ template void Vocabulary::readFromFile(const string& fileName) { - LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." - << std::endl; vocabulary_.close(); vocabulary_.open(fileName); - if constexpr (isCompressed_ && false) { - const auto& internalExternalVocab = - vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary(); - LOG(INFO) << "Done, number of words: " - << internalExternalVocab.internalVocab().size() << std::endl; - LOG(INFO) << "Number of words in external vocabulary: " - << internalExternalVocab.externalVocab().size() << std::endl; - } else { - LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl; - } // Precomputing ranges for IRIs, blank nodes, and literals, for faster // processing of the `isIrI` and `isLiteral` functions. @@ -88,19 +76,12 @@ bool Vocabulary::stringIsLiteral(std::string_view s) { // _____________________________________________________________________________ template bool Vocabulary::shouldBeExternalized(string_view s) const { - // TODO Completely refactor the Vocabulary on the different - // Types, it is a mess. - - // If the string is not compressed, this means that this is a text vocabulary - // and thus doesn't support externalization. 
- if constexpr (std::is_same_v) { - if (!stringIsLiteral(s)) { - return shouldEntityBeExternalized(s); - } else { - return shouldLiteralBeExternalized(s); - } + // TODO We should have a completely separate layer that handles the + // externalization, not the Vocab. + if (!stringIsLiteral(s)) { + return shouldEntityBeExternalized(s); } else { - return false; + return shouldLiteralBeExternalized(s); } } @@ -264,17 +245,18 @@ auto Vocabulary::prefixRanges(std::string_view prefix) const } // _____________________________________________________________________________ -template -auto Vocabulary::operator[](IndexType idx) const - -> AccessReturnType_t { +template +auto Vocabulary::operator[](IndexType idx) const + -> AccessReturnType { AD_CONTRACT_CHECK(idx.get() < size()); return vocabulary_[idx.get()]; } // Explicit template instantiations -template class Vocabulary; -template class Vocabulary; +template class Vocabulary; +template class Vocabulary; template void RdfsVocabulary::initializeInternalizedLangs( const nlohmann::json&); diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index fc9c118b87..c7a8454a4a 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -36,13 +36,6 @@ using std::string; using std::vector; -template -using AccessReturnType_t = std::string_view; -/* - std::conditional_t, - std::string, std::string_view>; - */ - template class IdRange { public: @@ -69,9 +62,15 @@ inline std::ostream& operator<<(std::ostream& stream, // retrieval. Template parameters that are supported are: // std::string -> no compression is applied // CompressedString -> prefix compression is applied -template +template class Vocabulary { public: + // The type that is returned by the `operator[]` of this vocabulary. Typically + // either `std::string` or `std::string_view`. + using AccessReturnType = + decltype(std::declval()[0]); + // The index ranges for a prefix + a function to check whether a given index // is contained in one of them. 
// @@ -96,17 +95,6 @@ class Vocabulary { // The different type of data that is stored in the vocabulary enum class Datatypes { Literal, Iri, Float, Date }; - template - using enable_if_compressed = - std::enable_if_t>; - - template - using enable_if_uncompressed = - std::enable_if_t>; - - static constexpr bool isCompressed_ = - std::is_same_v; - // If a literal uses one of these language tags or starts with one of these // prefixes, it will be externalized. By default, everything is externalized. // Both of these settings can be overridden using the `settings.json` file. @@ -116,13 +104,19 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; - using UnderlyingVocabulary = VocabularyInMemory; + // using UnderlyingVocabulary = VocabularyInMemory; /* using UnderlyingVocabulary = std::conditional_t, VocabularyInMemory>; */ + /* + using UnderlyingVocabulary = + std::conditional_t, + VocabularyInMemory>; + */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; @@ -137,10 +131,7 @@ class Vocabulary { using SortLevel = typename ComparatorType::Level; using IndexType = IndexT; - template < - typename = std::enable_if_t || - std::is_same_v>> - Vocabulary() {} + Vocabulary() = default; Vocabulary& operator=(Vocabulary&&) noexcept = default; Vocabulary(Vocabulary&&) noexcept = default; @@ -151,10 +142,7 @@ class Vocabulary { // Get the word with the given `idx`. Throw if the `idx` is not contained // in the vocabulary. - AccessReturnType_t operator[](IndexType idx) const; - - // AccessReturnType_t at(IndexType idx) const { return - // operator[](id); } + AccessReturnType operator[](IndexType idx) const; //! Get the number of words in the vocabulary. 
[[nodiscard]] size_t size() const { return vocabulary_.size(); } @@ -247,7 +235,12 @@ class Vocabulary { } }; -using RdfsVocabulary = - Vocabulary; -using TextVocabulary = - Vocabulary; +namespace detail { +using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; +using UnderlyingVocabTextVocabulary = VocabularyInMemory; +} // namespace detail + +using RdfsVocabulary = Vocabulary; +using TextVocabulary = Vocabulary; diff --git a/src/index/vocabulary/VocabularyInMemory.cpp b/src/index/vocabulary/VocabularyInMemory.cpp index f3db258d59..a1c82231d3 100644 --- a/src/index/vocabulary/VocabularyInMemory.cpp +++ b/src/index/vocabulary/VocabularyInMemory.cpp @@ -8,9 +8,12 @@ using std::string; // _____________________________________________________________________________ void VocabularyInMemory::open(const string& fileName) { + LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." + << std::endl; _words.clear(); ad_utility::serialization::FileReadSerializer file(fileName); file >> _words; + LOG(INFO) << "Done, number of words: " << size() << std::endl; } // _____________________________________________________________________________ diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index efe9a9c7e7..ed498d1702 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -68,12 +68,18 @@ class VocabularyInMemory struct WordWriter { typename Words::Writer writer_; explicit WordWriter(const std::string& filename) : writer_{filename} {} + + // Write a word. The `isExternalDummy` is only there to have a consistent + // interface with the `VocabularyInternalExternal`. void operator()(std::string_view str, [[maybe_unused]] bool isExternalDummy = false) { writer_.push(str.data(), str.size()); } void finish() { writer_.finish(); } + + // The `readableName()` function is only there to have a consistent + // interface with the `VocabularyInternalExternal`. 
std::string readableNameDummy_; std::string& readableName() { return readableNameDummy_; } }; diff --git a/src/index/vocabulary/VocabularyInternalExternal.cpp b/src/index/vocabulary/VocabularyInternalExternal.cpp index 62c5e29455..3d3d5fffb5 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.cpp +++ b/src/index/vocabulary/VocabularyInternalExternal.cpp @@ -37,3 +37,15 @@ void VocabularyInternalExternal::WordWriter::finish() { internalWriter_.finish(); externalWriter_.finish(); } + +// _____________________________________________________________________________ +void VocabularyInternalExternal::open(const string& filename) { + LOG(INFO) << "Reading vocabulary from file " << filename << " ..." + << std::endl; + internalVocab_.open(filename + ".internal"); + externalVocab_.open(filename + ".external"); + LOG(INFO) << "Done, number of words: " << size() << std::endl; + LOG(INFO) << "Number of words in internal vocabulary (these are also part " + "of the external vocabulary): " + << internalVocab_.size() << std::endl; +} diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index f9024369bd..d92510a49f 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -40,10 +40,7 @@ class VocabularyInternalExternal { // Read the vocabulary from a file. The file must have been created using a // `WordWriter`. 
- void open(const string& filename) { - internalVocab_.open(filename + ".internal"); - externalVocab_.open(filename + ".external"); - } + void open(const string& filename); // Return the total number of words [[nodiscard]] size_t size() const { return externalVocab_.size(); } diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 6ba1b8c6de..0cc6ae74c6 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -22,8 +22,8 @@ inline string getTextRecordFromResultTable(const QueryExecutionContext* qec, result.idTable().getColumn(0)[rowIndex].getTextRecordIndex().get(); if (nofNonLiterals <= textRecordIdFromTable) { // Return when from Literals - return qec->getIndex().indexToString( - VocabIndex::make(textRecordIdFromTable - nofNonLiterals)); + return std::string{qec->getIndex().indexToString( + VocabIndex::make(textRecordIdFromTable - nofNonLiterals))}; } else { // Return when from DocsDB return qec->getIndex().getTextExcerpt( @@ -41,8 +41,8 @@ inline const TextRecordIndex getTextRecordIdFromResultTable( inline string getEntityFromResultTable(const QueryExecutionContext* qec, const ProtoResult& result, const size_t& rowIndex) { - return qec->getIndex().indexToString( - result.idTable().getColumn(1)[rowIndex].getVocabIndex()); + return std::string{qec->getIndex().indexToString( + result.idTable().getColumn(1)[rowIndex].getVocabIndex())}; } // Only use on prefix search results From e406fa429a3b666f73391e2f4b7a7586a593cfc1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 31 Jan 2025 11:44:39 +0100 Subject: [PATCH 03/13] Making the vocab configuration configurable at runtime. 
Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 10 ++++++++++ src/index/Vocabulary.h | 18 +++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3679de4c51..67b2feb62b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,16 @@ if (${USE_CPP_17_BACKPORTS}) add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0") endif() +set(VOCAB_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary completely in RAM") +if (${VOCAB_IN_MEMORY}) + add_definitions("-D_QLEVER_VOCAB_IN_MEMORY") +endif () + +set(ENABLE_VOCAB_COMPRESSION ON CACHE BOOL "Compress the vocabulary") +if (${ENABLE_VOCAB_COMPRESSION}) + add_definitions("-D_QLEVER_ENABLE_VOCAB_COMPRESSION") +endif () + # Enable the specification of additional linker flags manually from the commandline set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index c7a8454a4a..e3513c39d4 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -236,7 +236,23 @@ class Vocabulary { }; namespace detail { -using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; +// The two macros `_QLEVER_VOCAB_IN_MEMORY` and +// `_QLEVER_ENABLE_VOCAB_COMPRESSION` can be used to disable the external vocab +// and the compression of the vocab at compile time. NOTE: These change the +// binary format of QLever's index, so changing them requires rebuilding of the +// indices. 
+#ifdef _QLEVER_VOCAB_IN_MEMORY +using VocabStorage = VocabularyInMemory; +#else +using VocabStorage = VocabularyInternalExternal; +#endif + +#ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION +using UnderlyingVocabRdfsVocabulary = VocabStorage; +#else +using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; +#endif + using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail From 49445e52badc92b2eae35a41a4b3e9cba80ce2a0 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 08:48:05 +0100 Subject: [PATCH 04/13] An intermediate commit before switching branches. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 + src/index/ConstantsIndexBuilding.h | 3 +- src/index/IndexBuilderMain.cpp | 11 ++ src/index/IndexImpl.cpp | 10 +- src/index/IndexImpl.h | 8 + src/index/Vocabulary.h | 14 ++ src/index/VocabularyOnDisk.h | 2 + src/index/vocabulary/CMakeLists.txt | 3 +- src/index/vocabulary/CompressedVocabulary.h | 16 +- src/index/vocabulary/UnicodeVocabulary.h | 8 +- src/index/vocabulary/VocabularyInMemory.h | 10 +- .../vocabulary/VocabularyInMemoryBinSearch.h | 3 + .../vocabulary/VocabularyInternalExternal.h | 12 ++ src/index/vocabulary/VocabularyVariant.cpp | 76 ++++++++++ src/index/vocabulary/VocabularyVariant.h | 143 ++++++++++++++++++ src/util/ProgramOptionsHelpers.h | 52 +++++-- src/util/Serializer/SerializeVector.h | 3 + 17 files changed, 355 insertions(+), 22 deletions(-) create mode 100644 src/index/vocabulary/VocabularyVariant.cpp create mode 100644 src/index/vocabulary/VocabularyVariant.h diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 1005add22d..9c37eb39ce 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -194,6 +194,9 @@ struct CompactStringVectorWriter { commonInitialization(); } + CompactStringVectorWriter(CompactStringVectorWriter&&) = default; + CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; + void push(const data_type* data, size_t elementSize) { 
AD_CONTRACT_CHECK(!_finished); _offsets.push_back(_nextOffset); diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index d7c1802969..4ca58f3e80 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -99,7 +99,8 @@ constinit inline std::atomic BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS = // the overhead of the metadata that has to be stored per block becomes // infeasible. 250K seems to be a reasonable tradeoff here. constexpr inline ad_utility::MemorySize - UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB; + UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = + ad_utility::MemorySize::kilobytes(250); constexpr inline size_t NumColumnsIndexBuilding = 4; diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index cfc121a2d1..8877c2d01a 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -11,6 +11,7 @@ #include #include "CompilationInfo.h" +#include "IndexImpl.h" #include "global/Constants.h" #include "index/ConstantsIndexBuilding.h" #include "index/Index.h" @@ -166,6 +167,8 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; + std::optional vocabType; + // VocabularyEnum vocabType; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -224,6 +227,9 @@ int main(int argc, char** argv) { add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos), "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); + add("vocabulary-type", po::value(&vocabType), + "The vocabulary implementation for strings in qlever, can be any of ... " + "(TODO joka)"); // Options for the index building process. 
add("stxxl-memory,m", po::value(&stxxlMemory), @@ -256,6 +262,11 @@ int main(int argc, char** argv) { if (parserBufferSize.has_value()) { index.parserBufferSize() = parserBufferSize.value(); } + /* + if (vocabType.has_value()) { + index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value()); + } + */ // If no text index name was specified, take the part of the wordsfile after // the last slash. diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index d5781bb297..40ffeb1115 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -341,6 +341,8 @@ void IndexImpl::createFromFiles( "The patterns can only be built when all 6 permutations are created"}; } + vocab_.resetToType(vocabularyTypeForIndexBuilding_); + readIndexBuilderSettingsFromFile(); updateInputFileSpecificationsAndLog(files, useParallelParser_); @@ -560,7 +562,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL); }; auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX); - wordCallback.readableName() = "internal vocabulary"; + // wordCallback.readableName() = "internal vocabulary"; return ad_utility::vocabulary_merger::mergeVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); @@ -1132,6 +1134,12 @@ void IndexImpl::readConfiguration() { loadDataMember("num-triples", numTriples_, NumNormalAndInternal{}); loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); + // TODO Comment and also write the configuration. + // The default value is the one that used to be the only option. 
+ VocabularyEnum vocabType(VocabularyEnum::Enum::CompressedOnDisk); + loadDataMember("vocabulary-type", vocabType, vocabType); + vocab_.resetToType(vocabType); + // Initialize BlankNodeManager uint64_t numBlankNodesTotal; loadDataMember("num-blank-nodes-total", numBlankNodesTotal); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8478943c92..ca35b52d86 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,6 +192,9 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; + VocabularyEnum vocabularyTypeForIndexBuilding_{ + VocabularyEnum::Enum::CompressedOnDisk}; + // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -275,6 +278,11 @@ class IndexImpl { return deltaTriples_.value(); } + void setVocabularyTypeForIndexBuilding(VocabularyEnum type) { + vocabularyTypeForIndexBuilding_ = type; + configurationJson_["vocabulary-type"] = type; + } + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index e3513c39d4..0f566cc138 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -27,6 +27,7 @@ #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" +#include "index/vocabulary/VocabularyVariant.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -233,6 +234,13 @@ class Vocabulary { const std::string& filename) const { return vocabulary_.getUnderlyingVocabulary().makeDiskWriter(filename); } + + // TODO Comment. 
+ void resetToType(VocabularyEnum type) { + if constexpr (std::is_same_v) { + vocabulary_.getUnderlyingVocabulary().resetToType(type); + } + } }; namespace detail { @@ -241,18 +249,24 @@ namespace detail { // and the compression of the vocab at compile time. NOTE: These change the // binary format of QLever's index, so changing them requires rebuilding of the // indices. +/* #ifdef _QLEVER_VOCAB_IN_MEMORY using VocabStorage = VocabularyInMemory; #else using VocabStorage = VocabularyInternalExternal; #endif +*/ +/* #ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION using UnderlyingVocabRdfsVocabulary = VocabStorage; #else using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; #endif +*/ +// TODO Change this place. +using UnderlyingVocabRdfsVocabulary = VocabularyVariant; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/VocabularyOnDisk.h b/src/index/VocabularyOnDisk.h index f677ac3e7a..2b6455cda3 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/VocabularyOnDisk.h @@ -58,6 +58,8 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { void finish(); // Destructor. Implicitly calls `finish` if it hasn't been called before. ~WordWriter(); + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; /// Build from a vector of pairs of `(string, id)`. 
This requires the IDs to diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index bb2dfdd4a3..ff3138601e 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,2 +1,3 @@ -add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp) +add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp + VocabularyVariant.cpp) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/CompressedVocabulary.h b/src/index/vocabulary/CompressedVocabulary.h index dad9e84457..1eeda3599c 100644 --- a/src/index/vocabulary/CompressedVocabulary.h +++ b/src/index/vocabulary/CompressedVocabulary.h @@ -193,6 +193,10 @@ class CompressedVocabulary { delete; DiskWriterFromUncompressedWords& operator=( const DiskWriterFromUncompressedWords&) = delete; + DiskWriterFromUncompressedWords(DiskWriterFromUncompressedWords&&) = + default; + DiskWriterFromUncompressedWords& operator=( + DiskWriterFromUncompressedWords&&) = default; private: // Compress a complete block and write it to the underlying vocabulary. @@ -243,12 +247,20 @@ class CompressedVocabulary { using WordWriter = DiskWriterFromUncompressedWords; // Return a `DiskWriter` that can be used to create the vocabulary. - DiskWriterFromUncompressedWords makeDiskWriter( - const std::string& filename) const { + static DiskWriterFromUncompressedWords makeDiskWriter( + const std::string& filename) { return DiskWriterFromUncompressedWords{ absl::StrCat(filename, wordsSuffix), absl::StrCat(filename, decodersSuffix)}; } + + static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique( + absl::StrCat(filename, wordsSuffix), + absl::StrCat(filename, decodersSuffix)); + } + /// Initialize the vocabulary from the given `words`. // TODO This can be a generic Mixin... 
void build(const std::vector& words, diff --git a/src/index/vocabulary/UnicodeVocabulary.h b/src/index/vocabulary/UnicodeVocabulary.h index c215843c0f..73dc85556c 100644 --- a/src/index/vocabulary/UnicodeVocabulary.h +++ b/src/index/vocabulary/UnicodeVocabulary.h @@ -102,6 +102,12 @@ class UnicodeVocabulary { void close() { _underlyingVocabulary.close(); } void build(const std::vector& v, const std::string& filename) { - _underlyingVocabulary.build(v, filename); + // TODO This is really hacky, we should get rid of it and make the + // building consistent for all the vocabularies. + if constexpr (requires { _underlyingVocabulary.build(v, filename); }) { + _underlyingVocabulary.build(v, filename); + } else { + AD_FAIL(); + } } }; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index ed498d1702..a2504ad265 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -82,14 +82,20 @@ class VocabularyInMemory // interface with the `VocabularyInternalExternal`. std::string readableNameDummy_; std::string& readableName() { return readableNameDummy_; } + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; // Return a `WordWriter` that directly writes the words to the given // `filename`. The words are not materialized in RAM, but the vocabulary later - // has to be explicitly initizlied via `open(filename)`. - WordWriter makeDiskWriter(const std::string& filename) const { + // has to be explicitly initialized via `open(filename)`. + static WordWriter makeDiskWriter(const std::string& filename) { return WordWriter{filename}; } + static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique(filename); + } /// Clear the vocabulary. 
void close() { _words.clear(); } diff --git a/src/index/vocabulary/VocabularyInMemoryBinSearch.h b/src/index/vocabulary/VocabularyInMemoryBinSearch.h index 8367c1e965..df2314eb81 100644 --- a/src/index/vocabulary/VocabularyInMemoryBinSearch.h +++ b/src/index/vocabulary/VocabularyInMemoryBinSearch.h @@ -79,6 +79,9 @@ class VocabularyInMemoryBinSearch // Finish writing and dump all contents that still reside in buffers to // disk. void finish(); + + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; // Clear the vocabulary. diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index d92510a49f..491381a88e 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -112,6 +112,9 @@ class VocabularyInternalExternal { // Finish writing. void finish(); + + WordWriter(WordWriter&&) = default; + WordWriter& operator=(WordWriter&&) = default; }; /// Clear the vocabulary. @@ -148,4 +151,13 @@ class VocabularyInternalExternal { return boundFunction(externalVocab_, word, comparator, boundFromInternalVocab.previousIndex(), upperBound); } + + public: + // TODO Clean up positions + static WordWriter makeDiskWriter(const std::string& filename) { + return WordWriter{filename}; + } + static auto makeDiskWriterPtr(const std::string& filename) { + return std::make_unique(filename); + } }; diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp new file mode 100644 index 0000000000..f8dca2b45d --- /dev/null +++ b/src/index/vocabulary/VocabularyVariant.cpp @@ -0,0 +1,76 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "index/vocabulary/VocabularyVariant.h" + +#include + +void VocabularyVariant::open(const std::string& filename) { + std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); +} + +void VocabularyVariant::open(const std::string& filename, VocabularyEnum type) { + resetToType(type); + open(filename); +} + +void VocabularyVariant::close() { + return std::visit([](auto& vocab) { return vocab.close(); }, vocab_); +} +size_t VocabularyVariant::size() const { + return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); +} +std::string VocabularyVariant::operator[](uint64_t i) const { + return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); +} + +VocabularyVariant::WordWriter::WordWriter(WordWriters writer) + : writer_(std::move(writer)) {} + +void VocabularyVariant::WordWriter::finish() { + std::visit([](auto& writer) { return writer->finish(); }, writer_); +} + +void VocabularyVariant::WordWriter::operator()(std::string_view word, + bool isExternal) { + std::visit( + [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, + writer_); +} + +auto VocabularyVariant::makeDiskWriter(const std::string& filename) const + -> WordWriter { + return WordWriter{std::visit( + [&filename](auto& vocab) -> WordWriters { + return vocab.makeDiskWriterPtr(filename); + }, + vocab_)}; +} + +VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( + const std::string& filename, VocabularyEnum type) { + VocabularyVariant dummyVocab; + dummyVocab.resetToType(type); + return dummyVocab.makeDiskWriter(filename); +} + +void VocabularyVariant::resetToType(VocabularyEnum type) { + close(); + switch (type.value()) { + case VocabularyEnum::Enum::InMemory: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::OnDisk: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::CompressedInMemory: + vocab_.emplace(); + break; + case VocabularyEnum::Enum::CompressedOnDisk: + 
vocab_.emplace(); + break; + default: + AD_FAIL(); + } +} diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/VocabularyVariant.h new file mode 100644 index 0000000000..355fd58abd --- /dev/null +++ b/src/index/vocabulary/VocabularyVariant.h @@ -0,0 +1,143 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once +#include +#include + +#include + +#include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyInternalExternal.h" +#include "util/json.h" + +template +static constexpr auto getWordWriterTypes(const Variant& var) { + return std::apply( + [](const Vocab&...) { + return std::type_identity< + std::variant...>>{}; + }, + var); +} + +class VocabularyEnum { + public: + enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; + + private: + Enum value_ = Enum::InMemory; + + static constexpr std::array descriptions{ + "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", + "on-disk-compressed"}; + + public: + VocabularyEnum() = default; + explicit VocabularyEnum(Enum value) : value_{value} {} + + static VocabularyEnum fromString(std::string_view description) { + auto it = ql::ranges::find(descriptions, description); + if (it == descriptions.end()) { + throw std::runtime_error{ + absl::StrCat("\"", description, + "\" is not a valid vocabulary type. The currently " + "supported vocabulary types are ", + absl::StrJoin(descriptions, ", "))}; + ; + } + return VocabularyEnum{static_cast(it - descriptions.begin())}; + } + std::string_view toString() const { + return descriptions.at(static_cast(value_)); + } + + Enum value() const { return value_; } + + // Conversion To JSON. + friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + j = vocabEnum.toString(); + } + + // Conversion from JSON. 
+ friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { + vocabEnum = VocabularyEnum::fromString(static_cast(j)); + } +}; + +class VocabularyVariant { + private: + using InMemory = VocabularyInMemory; + using External = VocabularyInternalExternal; + using CompressedInMemory = CompressedVocabulary; + using CompressedExternal = CompressedVocabulary; + using Variant = + std::variant; + using Tuple = + std::tuple; + + Variant vocab_; + + public: + void resetToType(VocabularyEnum); + void open(const std::string& filename); + void open(const std::string& filename, VocabularyEnum type); + void close(); + size_t size() const; + std::string operator[](uint64_t i) const; + + template + WordAndIndex lower_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.lower_bound(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.lower_bound_iterator(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex upper_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.upper_bound(word, std::move(comp)); + }, + vocab_); + } + + template + WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.upper_bound_iterator(word, std::move(comp)); + }, + vocab_); + } + + using WordWriters = decltype(getWordWriterTypes(std::declval()))::type; + + class WordWriter { + WordWriters writer_; + + public: + explicit WordWriter(WordWriters); + + void finish(); + + void operator()(std::string_view word, bool isExternal); + }; + + WordWriter makeDiskWriter(const std::string& filename) const; + static WordWriter makeDiskWriter(const std::string& filename, + VocabularyEnum type); +}; diff --git a/src/util/ProgramOptionsHelpers.h 
b/src/util/ProgramOptionsHelpers.h index bd804504d3..6c25565287 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -11,6 +11,8 @@ #include "util/Concepts.h" #include "util/MemorySize/MemorySize.h" #include "util/Parameters.h" +// TODO only include the enum. +#include "index/vocabulary/VocabularyVariant.h" namespace ad_utility { // An implicit wrapper that can be implicitly converted to and from `size_t`. @@ -47,20 +49,6 @@ inline void validate(boost::any& v, const std::vector& values, v = NonNegative{boost::lexical_cast(s)}; } -// This function is required to use `std::optional` in -// `boost::program_options`. -template -void validate(boost::any& v, const std::vector& values, - std::optional*, int) { - // First parse as a T - T* dummy = nullptr; - validate(v, values, dummy, 0); - - // Wrap the T inside std::optional - AD_CONTRACT_CHECK(!v.empty()); - v = std::optional(boost::any_cast(v)); -} - // This function is required to use `MemorySize` in `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, MemorySize*, int) { @@ -120,4 +108,40 @@ class ParameterToProgramOptionFactory { } // namespace ad_utility +// This function is required to use `VocabularyEnum` in +// `boost::program_options`. +inline void validate(boost::any& v, const std::vector& values, + VocabularyEnum*, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and exception will be thrown. + const string& s = validators::get_single_string(values); + + // Convert the string to `MemorySize` and put it into the option. + v = VocabularyEnum::fromString(s); +} + +// This function is required to use `std::optional` in +// `boost::program_options`. 
+// TODO We should find a solution that doesn't require opening +// namespace `std`, for example we could put all types + this function into the +// `ad_utility`namespace. +namespace std { +template +void validate(boost::any& v, const std::vector& values, + std::optional*, int) { + // First parse as a T + T* dummy = nullptr; + // using namespace boost::program_options; + validate(v, values, dummy, 0); + + // Wrap the T inside std::optional + AD_CONTRACT_CHECK(!v.empty()); + v = std::optional(boost::any_cast(v)); +} +} // namespace std + #endif // QLEVER_PROGRAMOPTIONSHELPERS_H diff --git a/src/util/Serializer/SerializeVector.h b/src/util/Serializer/SerializeVector.h index 982e43e2ff..d093f4c02f 100644 --- a/src/util/Serializer/SerializeVector.h +++ b/src/util/Serializer/SerializeVector.h @@ -75,6 +75,9 @@ class VectorIncrementalSerializer { } ~VectorIncrementalSerializer() { finish(); } + VectorIncrementalSerializer(VectorIncrementalSerializer&&) = default; + VectorIncrementalSerializer& operator=(VectorIncrementalSerializer&&) = + default; }; } // namespace ad_utility::serialization From 6d11c3ba8d03532a6eba846599ec7696a614bdea Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 11:41:29 +0100 Subject: [PATCH 05/13] This seems to work, but the IDE has crashed, so we just restart:) Signed-off-by: Johannes Kalmbach --- src/index/IndexBuilderMain.cpp | 2 +- src/index/IndexImpl.cpp | 4 +- src/index/IndexImpl.h | 6 +- src/index/Vocabulary.cpp | 12 +++- src/index/Vocabulary.h | 2 +- src/index/VocabularyOnDisk.h | 2 - src/index/vocabulary/CMakeLists.txt | 3 +- src/index/vocabulary/CompressedVocabulary.h | 17 +---- src/index/vocabulary/UnicodeVocabulary.h | 10 --- src/index/vocabulary/VocabularyInMemory.h | 21 +----- .../vocabulary/VocabularyInMemoryBinSearch.h | 3 - .../vocabulary/VocabularyInternalExternal.h | 21 +++--- src/index/vocabulary/VocabularyType.h | 56 +++++++++++++++ src/index/vocabulary/VocabularyVariant.cpp | 14 ++-- 
src/index/vocabulary/VocabularyVariant.h | 70 +++++-------------- src/util/ProgramOptionsHelpers.h | 37 +++++----- src/util/Serializer/SerializeVector.h | 3 - 17 files changed, 128 insertions(+), 155 deletions(-) create mode 100644 src/index/vocabulary/VocabularyType.h diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 8877c2d01a..1583a9a14f 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -167,7 +167,7 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; - std::optional vocabType; + std::optional vocabType; // VocabularyEnum vocabType; optind = 1; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 40ffeb1115..dd29e6d57a 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -562,7 +562,6 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL); }; auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX); - // wordCallback.readableName() = "internal vocabulary"; return ad_utility::vocabulary_merger::mergeVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); @@ -1136,7 +1135,8 @@ void IndexImpl::readConfiguration() { // TODO Comment and also write the configuration. // The default value is the one the used to be the only. 
- VocabularyEnum vocabType(VocabularyEnum::Enum::CompressedOnDisk); + ad_utility::VocabularyEnum vocabType( + ad_utility::VocabularyEnum::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); vocab_.resetToType(vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index ca35b52d86..7c4a937fdb 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,8 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; - VocabularyEnum vocabularyTypeForIndexBuilding_{ - VocabularyEnum::Enum::CompressedOnDisk}; + ad_utility::VocabularyEnum vocabularyTypeForIndexBuilding_{ + ad_utility::VocabularyEnum::Enum::CompressedOnDisk}; // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -278,7 +278,7 @@ class IndexImpl { return deltaTriples_.value(); } - void setVocabularyTypeForIndexBuilding(VocabularyEnum type) { + void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyEnum type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; } diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index 70e9f0c50e..80c61cc0ea 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -63,7 +63,17 @@ void Vocabulary::createFromSet( return getCaseComparator()(a, b, SortLevel::TOTAL); }; std::sort(begin(words), end(words), totalComparison); - vocabulary_.build(words, filename); + auto writer = makeWordWriter(filename); + auto writeWords = [&writer](std::string_view word) { + // All words are stored in the internal vocab (this is consistent with the + // previous behavior). NOTE: This function is currently only used for the + // text index and for few unit tests, where we don't have an external + // vocabulary anyway. 
+ writer(word, false); + }; + ql::ranges::for_each(words, writeWords); + writer.finish(); + vocabulary_.open(filename); LOG(DEBUG) << "END Vocabulary::createFromSet" << std::endl; } diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 0f566cc138..898233e284 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -236,7 +236,7 @@ class Vocabulary { } // TODO Comment. - void resetToType(VocabularyEnum type) { + void resetToType(ad_utility::VocabularyEnum type) { if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); } diff --git a/src/index/VocabularyOnDisk.h b/src/index/VocabularyOnDisk.h index 2b6455cda3..f677ac3e7a 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/VocabularyOnDisk.h @@ -58,8 +58,6 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { void finish(); // Destructor. Implicitly calls `finish` if it hasn't been called before. ~WordWriter(); - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; /// Build from a vector of pairs of `(string, id)`. 
This requires the IDs to diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index ff3138601e..151f8ec18c 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,3 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - VocabularyVariant.cpp) + VocabularyVariant.cpp + VocabularyType.h) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/CompressedVocabulary.h b/src/index/vocabulary/CompressedVocabulary.h index 1eeda3599c..7f685750d4 100644 --- a/src/index/vocabulary/CompressedVocabulary.h +++ b/src/index/vocabulary/CompressedVocabulary.h @@ -193,10 +193,6 @@ class CompressedVocabulary { delete; DiskWriterFromUncompressedWords& operator=( const DiskWriterFromUncompressedWords&) = delete; - DiskWriterFromUncompressedWords(DiskWriterFromUncompressedWords&&) = - default; - DiskWriterFromUncompressedWords& operator=( - DiskWriterFromUncompressedWords&&) = default; private: // Compress a complete block and write it to the underlying vocabulary. @@ -254,6 +250,7 @@ class CompressedVocabulary { absl::StrCat(filename, decodersSuffix)}; } + // Return a `unique_ptr`. static std::unique_ptr makeDiskWriterPtr( const std::string& filename) { return std::make_unique( @@ -261,18 +258,6 @@ class CompressedVocabulary { absl::StrCat(filename, decodersSuffix)); } - /// Initialize the vocabulary from the given `words`. - // TODO This can be a generic Mixin... - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); - } - // Access to the underlying vocabulary. 
UnderlyingVocabulary& getUnderlyingVocabulary() { return underlyingVocabulary_; diff --git a/src/index/vocabulary/UnicodeVocabulary.h b/src/index/vocabulary/UnicodeVocabulary.h index 73dc85556c..66aaaf0d67 100644 --- a/src/index/vocabulary/UnicodeVocabulary.h +++ b/src/index/vocabulary/UnicodeVocabulary.h @@ -100,14 +100,4 @@ class UnicodeVocabulary { const UnicodeComparator& getComparator() const { return _comparator; } void close() { _underlyingVocabulary.close(); } - - void build(const std::vector& v, const std::string& filename) { - // TODO This is really hacky, we should get rid of it and make the - // building consistent for all the vocabularies. - if constexpr (requires { _underlyingVocabulary.build(v, filename); }) { - _underlyingVocabulary.build(v, filename); - } else { - AD_FAIL(); - } - } }; diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index a2504ad265..6d68e2a6f6 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -77,13 +77,6 @@ class VocabularyInMemory } void finish() { writer_.finish(); } - - // The `readableName()` function is only there to have a consistent - // interface with the `VocabularyInternalExternal`. - std::string readableNameDummy_; - std::string& readableName() { return readableNameDummy_; } - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; // Return a `WordWriter` that directly writes the words to the given @@ -92,6 +85,9 @@ class VocabularyInMemory static WordWriter makeDiskWriter(const std::string& filename) { return WordWriter{filename}; } + + // Same as `makeDiskWriter` above, but the result is returned via + // `unique_ptr`. static std::unique_ptr makeDiskWriterPtr( const std::string& filename) { return std::make_unique(filename); @@ -100,17 +96,6 @@ class VocabularyInMemory /// Clear the vocabulary. 
void close() { _words.clear(); } - /// Initialize the vocabulary from the given `words`. - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); - } - // Const access to the underlying words. auto begin() const { return _words.begin(); } auto end() const { return _words.end(); } diff --git a/src/index/vocabulary/VocabularyInMemoryBinSearch.h b/src/index/vocabulary/VocabularyInMemoryBinSearch.h index df2314eb81..8367c1e965 100644 --- a/src/index/vocabulary/VocabularyInMemoryBinSearch.h +++ b/src/index/vocabulary/VocabularyInMemoryBinSearch.h @@ -79,9 +79,6 @@ class VocabularyInMemoryBinSearch // Finish writing and dump all contents that still reside in buffers to // disk. void finish(); - - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; // Clear the vocabulary. diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index 491381a88e..897b29258d 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -112,11 +112,17 @@ class VocabularyInternalExternal { // Finish writing. void finish(); - - WordWriter(WordWriter&&) = default; - WordWriter& operator=(WordWriter&&) = default; }; + // Return a `WordWriter` or (in the second function) a + // `unique_ptr` for the given filename. + static WordWriter makeDiskWriter(const std::string& filename) { + return WordWriter{filename}; + } + static auto makeDiskWriterPtr(const std::string& filename) { + return std::make_unique(filename); + } + /// Clear the vocabulary. 
void close() { internalVocab_.close(); } @@ -151,13 +157,4 @@ class VocabularyInternalExternal { return boundFunction(externalVocab_, word, comparator, boundFromInternalVocab.previousIndex(), upperBound); } - - public: - // TODO Clean up positions - static WordWriter makeDiskWriter(const std::string& filename) { - return WordWriter{filename}; - } - static auto makeDiskWriterPtr(const std::string& filename) { - return std::make_unique(filename); - } }; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h new file mode 100644 index 0000000000..21474023a3 --- /dev/null +++ b/src/index/vocabulary/VocabularyType.h @@ -0,0 +1,56 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once + +#include +#include + +#include "util/json.h" + +namespace ad_utility { +class VocabularyEnum { + public: + enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; + + private: + Enum value_ = Enum::InMemory; + + static constexpr std::array descriptions{ + "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", + "on-disk-compressed"}; + + public: + VocabularyEnum() = default; + explicit VocabularyEnum(Enum value) : value_{value} {} + + static VocabularyEnum fromString(std::string_view description) { + auto it = ql::ranges::find(descriptions, description); + if (it == descriptions.end()) { + throw std::runtime_error{ + absl::StrCat("\"", description, + "\" is not a valid vocabulary type. The currently " + "supported vocabulary types are ", + absl::StrJoin(descriptions, ", "))}; + ; + } + return VocabularyEnum{static_cast(it - descriptions.begin())}; + } + std::string_view toString() const { + return descriptions.at(static_cast(value_)); + } + + Enum value() const { return value_; } + + // Conversion To JSON. 
+ friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + j = vocabEnum.toString(); + } + + // Conversion from JSON. + friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { + vocabEnum = VocabularyEnum::fromString(static_cast(j)); + } +}; +} // namespace ad_utility diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp index f8dca2b45d..504591116e 100644 --- a/src/index/vocabulary/VocabularyVariant.cpp +++ b/src/index/vocabulary/VocabularyVariant.cpp @@ -10,7 +10,7 @@ void VocabularyVariant::open(const std::string& filename) { std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); } -void VocabularyVariant::open(const std::string& filename, VocabularyEnum type) { +void VocabularyVariant::open(const std::string& filename, VocabularyType type) { resetToType(type); open(filename); } @@ -49,25 +49,25 @@ auto VocabularyVariant::makeDiskWriter(const std::string& filename) const } VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( - const std::string& filename, VocabularyEnum type) { + const std::string& filename, VocabularyType type) { VocabularyVariant dummyVocab; dummyVocab.resetToType(type); return dummyVocab.makeDiskWriter(filename); } -void VocabularyVariant::resetToType(VocabularyEnum type) { +void VocabularyVariant::resetToType(VocabularyType type) { close(); switch (type.value()) { - case VocabularyEnum::Enum::InMemory: + case VocabularyType::Enum::InMemory: vocab_.emplace(); break; - case VocabularyEnum::Enum::OnDisk: + case VocabularyType::Enum::OnDisk: vocab_.emplace(); break; - case VocabularyEnum::Enum::CompressedInMemory: + case VocabularyType::Enum::CompressedInMemory: vocab_.emplace(); break; - case VocabularyEnum::Enum::CompressedOnDisk: + case VocabularyType::Enum::CompressedOnDisk: vocab_.emplace(); break; default: diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/VocabularyVariant.h index 
355fd58abd..7ec162890d 100644 --- a/src/index/vocabulary/VocabularyVariant.h +++ b/src/index/vocabulary/VocabularyVariant.h @@ -11,63 +11,24 @@ #include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" +#include "index/vocabulary/VocabularyType.h" #include "util/json.h" -template -static constexpr auto getWordWriterTypes(const Variant& var) { - return std::apply( - [](const Vocab&...) { - return std::type_identity< - std::variant...>>{}; - }, - var); -} - -class VocabularyEnum { - public: - enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; - - private: - Enum value_ = Enum::InMemory; - - static constexpr std::array descriptions{ - "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", - "on-disk-compressed"}; +namespace polymorphic_vocabulary::detail { - public: - VocabularyEnum() = default; - explicit VocabularyEnum(Enum value) : value_{value} {} - - static VocabularyEnum fromString(std::string_view description) { - auto it = ql::ranges::find(descriptions, description); - if (it == descriptions.end()) { - throw std::runtime_error{ - absl::StrCat("\"", description, - "\" is not a valid vocabulary type. The currently " - "supported vocabulary types are ", - absl::StrJoin(descriptions, ", "))}; - ; - } - return VocabularyEnum{static_cast(it - descriptions.begin())}; - } - std::string_view toString() const { - return descriptions.at(static_cast(value_)); - } +template +struct WriterPointers {}; - Enum value() const { return value_; } - - // Conversion To JSON. - friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { - j = vocabEnum.toString(); - } - - // Conversion from JSON. 
- friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { - vocabEnum = VocabularyEnum::fromString(static_cast(j)); - } +template +struct WriterPointers> { + using type = std::variant...>; }; +} // namespace polymorphic_vocabulary::detail class VocabularyVariant { + public: + using VocabularyType = ad_utility::VocabularyEnum; + private: using InMemory = VocabularyInMemory; using External = VocabularyInternalExternal; @@ -81,9 +42,9 @@ class VocabularyVariant { Variant vocab_; public: - void resetToType(VocabularyEnum); + void resetToType(VocabularyType); void open(const std::string& filename); - void open(const std::string& filename, VocabularyEnum type); + void open(const std::string& filename, VocabularyType type); void close(); size_t size() const; std::string operator[](uint64_t i) const; @@ -124,7 +85,8 @@ class VocabularyVariant { vocab_); } - using WordWriters = decltype(getWordWriterTypes(std::declval()))::type; + using WordWriters = + polymorphic_vocabulary::detail::WriterPointers::type; class WordWriter { WordWriters writer_; @@ -139,5 +101,5 @@ class VocabularyVariant { WordWriter makeDiskWriter(const std::string& filename) const; static WordWriter makeDiskWriter(const std::string& filename, - VocabularyEnum type); + VocabularyType type); }; diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index 6c25565287..0d3ede6a1a 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -49,6 +49,21 @@ inline void validate(boost::any& v, const std::vector& values, v = NonNegative{boost::lexical_cast(s)}; } +// This function is required to use `std::optional` in +// `boost::program_options`. 
+template +void validate(boost::any& v, const std::vector& values, + std::optional*, int) { + // First parse as a T + T* dummy = nullptr; + // using namespace boost::program_options; + validate(v, values, dummy, 0); + + // Wrap the T inside std::optional + AD_CONTRACT_CHECK(!v.empty()); + v = std::optional(boost::any_cast(v)); +} + // This function is required to use `MemorySize` in `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, MemorySize*, int) { @@ -106,8 +121,6 @@ class ParameterToProgramOptionFactory { } }; -} // namespace ad_utility - // This function is required to use `VocabularyEnum` in // `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, @@ -124,24 +137,6 @@ inline void validate(boost::any& v, const std::vector& values, v = VocabularyEnum::fromString(s); } -// This function is required to use `std::optional` in -// `boost::program_options`. -// TODO We should find a solution that doesn't require opening -// namespace `std`, for example we could put all types + this function into the -// `ad_utility`namespace. 
-namespace std { -template -void validate(boost::any& v, const std::vector& values, - std::optional*, int) { - // First parse as a T - T* dummy = nullptr; - // using namespace boost::program_options; - validate(v, values, dummy, 0); - - // Wrap the T inside std::optional - AD_CONTRACT_CHECK(!v.empty()); - v = std::optional(boost::any_cast(v)); -} -} // namespace std +} // namespace ad_utility #endif // QLEVER_PROGRAMOPTIONSHELPERS_H diff --git a/src/util/Serializer/SerializeVector.h b/src/util/Serializer/SerializeVector.h index d093f4c02f..982e43e2ff 100644 --- a/src/util/Serializer/SerializeVector.h +++ b/src/util/Serializer/SerializeVector.h @@ -75,9 +75,6 @@ class VectorIncrementalSerializer { } ~VectorIncrementalSerializer() { finish(); } - VectorIncrementalSerializer(VectorIncrementalSerializer&&) = default; - VectorIncrementalSerializer& operator=(VectorIncrementalSerializer&&) = - default; }; } // namespace ad_utility::serialization From 3e7f49476c5f344bd44a5a447a8ee20653c4adcf Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 18:53:41 +0100 Subject: [PATCH 06/13] Several refactorings. 
Signed-off-by: Johannes Kalmbach --- src/index/IndexBuilderMain.cpp | 2 +- src/index/IndexImpl.cpp | 4 +- src/index/IndexImpl.h | 6 +- src/index/Vocabulary.h | 8 +- src/index/vocabulary/CMakeLists.txt | 2 +- .../vocabulary/PolymorphicVocabulary.cpp | 90 +++++++++++++++++++ ...ularyVariant.h => PolymorphicVocabulary.h} | 53 +++++++++-- src/index/vocabulary/VocabularyType.h | 16 ++-- src/index/vocabulary/VocabularyVariant.cpp | 76 ---------------- src/util/ProgramOptionsHelpers.h | 7 +- 10 files changed, 158 insertions(+), 106 deletions(-) create mode 100644 src/index/vocabulary/PolymorphicVocabulary.cpp rename src/index/vocabulary/{VocabularyVariant.h => PolymorphicVocabulary.h} (56%) delete mode 100644 src/index/vocabulary/VocabularyVariant.cpp diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 1583a9a14f..c75fd5d427 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -167,7 +167,7 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; - std::optional vocabType; + std::optional vocabType; // VocabularyEnum vocabType; optind = 1; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index dd29e6d57a..3ad2e997ec 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1135,8 +1135,8 @@ void IndexImpl::readConfiguration() { // TODO Comment and also write the configuration. // The default value is the one the used to be the only. 
- ad_utility::VocabularyEnum vocabType( - ad_utility::VocabularyEnum::Enum::CompressedOnDisk); + ad_utility::VocabularyType vocabType( + ad_utility::VocabularyType::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); vocab_.resetToType(vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 7c4a937fdb..aaa6d0a1f1 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,8 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; - ad_utility::VocabularyEnum vocabularyTypeForIndexBuilding_{ - ad_utility::VocabularyEnum::Enum::CompressedOnDisk}; + ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ + ad_utility::VocabularyType::Enum::CompressedOnDisk}; // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -278,7 +278,7 @@ class IndexImpl { return deltaTriples_.value(); } - void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyEnum type) { + void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; } diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 898233e284..46af9c8c56 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -24,10 +24,10 @@ #include "index/StringSortComparator.h" #include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/PolymorphicVocabulary.h" #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" #include "index/vocabulary/VocabularyInternalExternal.h" -#include "index/vocabulary/VocabularyVariant.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -236,8 +236,8 @@ class Vocabulary { } // TODO Comment. 
- void resetToType(ad_utility::VocabularyEnum type) { - if constexpr (std::is_same_v) { + void resetToType(ad_utility::VocabularyType type) { + if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); } } @@ -266,7 +266,7 @@ using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; */ // TODO Change this place. -using UnderlyingVocabRdfsVocabulary = VocabularyVariant; +using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index 151f8ec18c..910ad61c3a 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - VocabularyVariant.cpp + PolymorphicVocabulary.cpp VocabularyType.h) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp new file mode 100644 index 0000000000..1b9936afee --- /dev/null +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -0,0 +1,90 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "index/vocabulary/PolymorphicVocabulary.h" + +#include + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename) { + std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename, + VocabularyType type) { + resetToType(type); + open(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::close() { + std::visit([](auto& vocab) { return vocab.close(); }, vocab_); +} + +// _____________________________________________________________________________ +size_t PolymorphicVocabulary::size() const { + return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); +} + +// _____________________________________________________________________________ +std::string PolymorphicVocabulary::operator[](uint64_t i) const { + return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter::WordWriter(WordWriters writer) + : writer_(std::move(writer)) {} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::finish() { + std::visit([](auto& writer) { return writer->finish(); }, writer_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::operator()(std::string_view word, + bool isExternal) { + std::visit( + [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, + writer_); +} + +// _____________________________________________________________________________ +auto PolymorphicVocabulary::makeDiskWriter(const std::string& 
filename) const + -> WordWriter { + return WordWriter{std::visit( + [&filename](auto& vocab) -> WordWriters { + return vocab.makeDiskWriterPtr(filename); + }, + vocab_)}; +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter PolymorphicVocabulary::makeDiskWriter( + const std::string& filename, VocabularyType type) { + PolymorphicVocabulary dummyVocab; + dummyVocab.resetToType(type); + return dummyVocab.makeDiskWriter(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::resetToType(VocabularyType type) { + close(); + switch (type.value()) { + case VocabularyType::Enum::InMemory: + vocab_.emplace(); + break; + case VocabularyType::Enum::OnDisk: + vocab_.emplace(); + break; + case VocabularyType::Enum::CompressedInMemory: + vocab_.emplace(); + break; + case VocabularyType::Enum::CompressedOnDisk: + vocab_.emplace(); + break; + default: + AD_FAIL(); + } +} diff --git a/src/index/vocabulary/VocabularyVariant.h b/src/index/vocabulary/PolymorphicVocabulary.h similarity index 56% rename from src/index/vocabulary/VocabularyVariant.h rename to src/index/vocabulary/PolymorphicVocabulary.h index 7ec162890d..4a18e57465 100644 --- a/src/index/vocabulary/VocabularyVariant.h +++ b/src/index/vocabulary/PolymorphicVocabulary.h @@ -16,6 +16,10 @@ namespace polymorphic_vocabulary::detail { +// For `T = std::variant = +// std::variant, +// unique_ptr, ...>`. This is used in the implementation +// of the `PolymorphicVocabulary` below. template struct WriterPointers {}; @@ -25,30 +29,51 @@ struct WriterPointers> { }; } // namespace polymorphic_vocabulary::detail -class VocabularyVariant { +// A vocabulary that can at runtime choose between different vocabulary +// implementations. The only restriction is, that a vocabulary can only be read +// from disk with the same implementation that it was written to. 
+class PolymorphicVocabulary { public: - using VocabularyType = ad_utility::VocabularyEnum; + using VocabularyType = ad_utility::VocabularyType; private: + // Type aliases for all the currently supported vocabularies. If another + // vocabulary is added, don't forget to also register it in the + // `VocabularyType` enum. using InMemory = VocabularyInMemory; using External = VocabularyInternalExternal; using CompressedInMemory = CompressedVocabulary; using CompressedExternal = CompressedVocabulary; using Variant = std::variant; - using Tuple = - std::tuple; + // In this variant we store the actual vocabulary. Variant vocab_; public: - void resetToType(VocabularyType); - void open(const std::string& filename); + // Read a vocabulary with the given `type` from the file with the `filename`. + // A vocabulary with the corresponding `type` must have been previously + // written to that file. void open(const std::string& filename, VocabularyType type); + + // Close the vocabulary if it is open, and set the underlying vocabulary + // implementation according to the `type` without opening the vocabulary. + void resetToType(VocabularyType type); + + // Same as the overload of `open` above, but expects that the correct + // `VocabularyType` has already been set via `resetToType` above. + void open(const std::string& filename); + + // Close the vocabulary s.t. it consumes no more RAM. void close(); + + // Return the total number of words in the vocabulary. size_t size() const; + + // Return the `i`-th word, throw if `i` is out of bounds. std::string operator[](uint64_t i) const; + // Same as `std::lower_bound`, return the smallest entry >= `word`. template WordAndIndex lower_bound(const String& word, Comp comp) const { return std::visit( @@ -58,6 +83,8 @@ class VocabularyVariant { vocab_); } + // Same as `lower_bound` above, but the comparator compares a `word` and an + // `iterator` instead of two words. 
template WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { return std::visit( @@ -67,6 +94,7 @@ class VocabularyVariant { vocab_); } + // Analogous to `lower_bound` (see above). template WordAndIndex upper_bound(const String& word, Comp comp) const { return std::visit( @@ -76,6 +104,7 @@ class VocabularyVariant { vocab_); } + // Analogous to `lower_bound_iterator` (see above). template WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { return std::visit( @@ -88,18 +117,28 @@ class VocabularyVariant { using WordWriters = polymorphic_vocabulary::detail::WriterPointers::type; + // The `WordWriter` is used to write a vocabulary to disk word by word (in + // sorted order). class WordWriter { WordWriters writer_; public: + // Constructor, used by the `makeDiskWriter` functions below. explicit WordWriter(WordWriters); + // This function has to be called after the last word has been written. void finish(); + // Write the next word to the vocabulary. void operator()(std::string_view word, bool isExternal); }; - WordWriter makeDiskWriter(const std::string& filename) const; + // Create a `WordWriter` that will create a vocabulary with the given `type` + // at the given `filename`. static WordWriter makeDiskWriter(const std::string& filename, VocabularyType type); + + // Same as above, but the `VocabularyType` is the currently active type of + // `this`. 
+ WordWriter makeDiskWriter(const std::string& filename) const; }; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index 21474023a3..4e65a481df 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -10,7 +10,7 @@ #include "util/json.h" namespace ad_utility { -class VocabularyEnum { +class VocabularyType { public: enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; @@ -22,10 +22,10 @@ class VocabularyEnum { "on-disk-compressed"}; public: - VocabularyEnum() = default; - explicit VocabularyEnum(Enum value) : value_{value} {} + VocabularyType() = default; + explicit VocabularyType(Enum value) : value_{value} {} - static VocabularyEnum fromString(std::string_view description) { + static VocabularyType fromString(std::string_view description) { auto it = ql::ranges::find(descriptions, description); if (it == descriptions.end()) { throw std::runtime_error{ @@ -35,7 +35,7 @@ class VocabularyEnum { absl::StrJoin(descriptions, ", "))}; ; } - return VocabularyEnum{static_cast(it - descriptions.begin())}; + return VocabularyType{static_cast(it - descriptions.begin())}; } std::string_view toString() const { return descriptions.at(static_cast(value_)); @@ -44,13 +44,13 @@ class VocabularyEnum { Enum value() const { return value_; } // Conversion To JSON. - friend void to_json(nlohmann::json& j, const VocabularyEnum& vocabEnum) { + friend void to_json(nlohmann::json& j, const VocabularyType& vocabEnum) { j = vocabEnum.toString(); } // Conversion from JSON. 
- friend void from_json(const nlohmann::json& j, VocabularyEnum& vocabEnum) { - vocabEnum = VocabularyEnum::fromString(static_cast(j)); + friend void from_json(const nlohmann::json& j, VocabularyType& vocabEnum) { + vocabEnum = VocabularyType::fromString(static_cast(j)); } }; } // namespace ad_utility diff --git a/src/index/vocabulary/VocabularyVariant.cpp b/src/index/vocabulary/VocabularyVariant.cpp deleted file mode 100644 index 504591116e..0000000000 --- a/src/index/vocabulary/VocabularyVariant.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach - -#include "index/vocabulary/VocabularyVariant.h" - -#include - -void VocabularyVariant::open(const std::string& filename) { - std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); -} - -void VocabularyVariant::open(const std::string& filename, VocabularyType type) { - resetToType(type); - open(filename); -} - -void VocabularyVariant::close() { - return std::visit([](auto& vocab) { return vocab.close(); }, vocab_); -} -size_t VocabularyVariant::size() const { - return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); -} -std::string VocabularyVariant::operator[](uint64_t i) const { - return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); -} - -VocabularyVariant::WordWriter::WordWriter(WordWriters writer) - : writer_(std::move(writer)) {} - -void VocabularyVariant::WordWriter::finish() { - std::visit([](auto& writer) { return writer->finish(); }, writer_); -} - -void VocabularyVariant::WordWriter::operator()(std::string_view word, - bool isExternal) { - std::visit( - [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, - writer_); -} - -auto VocabularyVariant::makeDiskWriter(const std::string& filename) const - -> WordWriter { - return WordWriter{std::visit( - [&filename](auto& vocab) -> WordWriters { - return vocab.makeDiskWriterPtr(filename); - 
}, - vocab_)}; -} - -VocabularyVariant::WordWriter VocabularyVariant::makeDiskWriter( - const std::string& filename, VocabularyType type) { - VocabularyVariant dummyVocab; - dummyVocab.resetToType(type); - return dummyVocab.makeDiskWriter(filename); -} - -void VocabularyVariant::resetToType(VocabularyType type) { - close(); - switch (type.value()) { - case VocabularyType::Enum::InMemory: - vocab_.emplace(); - break; - case VocabularyType::Enum::OnDisk: - vocab_.emplace(); - break; - case VocabularyType::Enum::CompressedInMemory: - vocab_.emplace(); - break; - case VocabularyType::Enum::CompressedOnDisk: - vocab_.emplace(); - break; - default: - AD_FAIL(); - } -} diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index 0d3ede6a1a..a86a850c35 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -8,11 +8,10 @@ #include #include +#include "index/vocabulary/VocabularyType.h" #include "util/Concepts.h" #include "util/MemorySize/MemorySize.h" #include "util/Parameters.h" -// TODO only include the enum. -#include "index/vocabulary/VocabularyVariant.h" namespace ad_utility { // An implicit wrapper that can be implicitly converted to and from `size_t`. @@ -124,7 +123,7 @@ class ParameterToProgramOptionFactory { // This function is required to use `VocabularyEnum` in // `boost::program_options`. inline void validate(boost::any& v, const std::vector& values, - VocabularyEnum*, int) { + VocabularyType*, int) { using namespace boost::program_options; // Make sure no previous assignment to 'v' was made. @@ -134,7 +133,7 @@ inline void validate(boost::any& v, const std::vector& values, const string& s = validators::get_single_string(values); // Convert the string to `MemorySize` and put it into the option. 
- v = VocabularyEnum::fromString(s); + v = VocabularyType::fromString(s); } } // namespace ad_utility From 825f8bfb754ef2e83e1f6aed374207aa9b331d35 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:09:04 +0100 Subject: [PATCH 07/13] Some additional fixes and comments. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 --- src/index/IndexBuilderMain.cpp | 9 ++++----- src/index/IndexImpl.cpp | 2 -- src/index/IndexImpl.h | 2 ++ src/index/vocabulary/VocabularyType.h | 7 +++++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 9c37eb39ce..1005add22d 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -194,9 +194,6 @@ struct CompactStringVectorWriter { commonInitialization(); } - CompactStringVectorWriter(CompactStringVectorWriter&&) = default; - CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; - void push(const data_type* data, size_t elementSize) { AD_CONTRACT_CHECK(!_finished); _offsets.push_back(_nextOffset); diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index c75fd5d427..29b11eae9b 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -168,7 +168,6 @@ int main(int argc, char** argv) { std::optional stxxlMemory; std::optional parserBufferSize; std::optional vocabType; - // VocabularyEnum vocabType; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -228,8 +227,9 @@ int main(int argc, char** argv) { "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); add("vocabulary-type", po::value(&vocabType), - "The vocabulary implementation for strings in qlever, can be any of ... " - "(TODO joka)"); + absl::StrCat( + "The vocabulary implementation for strings in qlever, can be any of ", + ad_utility::VocabularyType::getListOfSupportedValues())); // Options for the index building process. 
add("stxxl-memory,m", po::value(&stxxlMemory), @@ -262,11 +262,10 @@ int main(int argc, char** argv) { if (parserBufferSize.has_value()) { index.parserBufferSize() = parserBufferSize.value(); } - /* + if (vocabType.has_value()) { index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value()); } - */ // If no text index name was specified, take the part of the wordsfile after // the last slash. diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 3ad2e997ec..9d8f89c19f 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1133,8 +1133,6 @@ void IndexImpl::readConfiguration() { loadDataMember("num-triples", numTriples_, NumNormalAndInternal{}); loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); - // TODO Comment and also write the configuration. - // The default value is the one the used to be the only. ad_utility::VocabularyType vocabType( ad_utility::VocabularyType::Enum::CompressedOnDisk); loadDataMember("vocabulary-type", vocabType, vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index aaa6d0a1f1..a8828f2236 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -192,6 +192,8 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; + // The vocabulary type that is used (only relevant during index building). + // The default is chosen s.t. the compatibility to old index builds. ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ ad_utility::VocabularyType::Enum::CompressedOnDisk}; diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index 4e65a481df..a6b0eacfb4 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -32,11 +32,14 @@ class VocabularyType { absl::StrCat("\"", description, "\" is not a valid vocabulary type. 
The currently " "supported vocabulary types are ", - absl::StrJoin(descriptions, ", "))}; - ; + getListOfSupportedValues())}; } return VocabularyType{static_cast(it - descriptions.begin())}; } + + static std::string getListOfSupportedValues() { + return absl::StrJoin(descriptions, ", "); + } std::string_view toString() const { return descriptions.at(static_cast(value_)); } From 066ddf62c50de6add776499d08f1e643750d3a71 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:29:04 +0100 Subject: [PATCH 08/13] Refactoring there and back again. Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.h | 3 ++- src/index/Vocabulary.h | 18 +----------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a8828f2236..f3aba12cbb 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -193,7 +193,6 @@ class IndexImpl { std::optional idOfInternalGraphDuringIndexBuilding_; // The vocabulary type that is used (only relevant during index building). - // The default is chosen s.t. the compatibility to old index builds. ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ ad_utility::VocabularyType::Enum::CompressedOnDisk}; @@ -280,6 +279,8 @@ class IndexImpl { return deltaTriples_.value(); } + // See the documentation of the `vocabularyTypeForIndexBuilding_` member for + // details. 
void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) { vocabularyTypeForIndexBuilding_ = type; configurationJson_["vocabulary-type"] = type; diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 46af9c8c56..7587275118 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -20,7 +20,6 @@ #include "global/Constants.h" #include "global/Id.h" #include "global/Pattern.h" -#include "index/CompressedString.h" #include "index/StringSortComparator.h" #include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" @@ -60,9 +59,7 @@ inline std::ostream& operator<<(std::ostream& stream, } // A vocabulary. Wraps a vector of strings and provides additional methods for -// retrieval. Template parameters that are supported are: -// std::string -> no compression is applied -// CompressedString -> prefix compression is applied +// retrieval. template class Vocabulary { @@ -105,19 +102,6 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; - // using UnderlyingVocabulary = VocabularyInMemory; - /* - using UnderlyingVocabulary = - std::conditional_t, - VocabularyInMemory>; - */ - /* - using UnderlyingVocabulary = - std::conditional_t, - VocabularyInMemory>; - */ using VocabularyWithUnicodeComparator = UnicodeVocabulary; From b9948ff68f580224c79bfe1cb8c590da3cbce99e Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 5 Feb 2025 19:37:16 +0100 Subject: [PATCH 09/13] Fix compilation. 
Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 3 +++ src/index/IndexBuilderMain.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 1005add22d..28ca2a9c0e 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -227,6 +227,9 @@ struct CompactStringVectorWriter { } } + CompactStringVectorWriter(CompactStringVectorWriter&&) = default; + CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; + private: // Has to be run by all the constructors void commonInitialization() { diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index 29b11eae9b..034e76050d 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -226,10 +226,10 @@ int main(int argc, char** argv) { add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos), "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); - add("vocabulary-type", po::value(&vocabType), - absl::StrCat( - "The vocabulary implementation for strings in qlever, can be any of ", - ad_utility::VocabularyType::getListOfSupportedValues())); + auto msg = absl::StrCat( + "The vocabulary implementation for strings in qlever, can be any of ", + ad_utility::VocabularyType::getListOfSupportedValues()); + add("vocabulary-type", po::value(&vocabType), msg.c_str()); // Options for the index building process. add("stxxl-memory,m", po::value(&stxxlMemory), From b1b884e4fb482b030b6b9b24ed600f332412dbfd Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 6 Feb 2025 12:15:00 +0100 Subject: [PATCH 10/13] Feed this to the tools... 
Signed-off-by: Johannes Kalmbach --- CMakeLists.txt | 11 ++--- src/global/Pattern.h | 5 ++- src/index/CMakeLists.txt | 2 +- src/index/Vocabulary.h | 37 ++++++--------- src/index/vocabulary/CMakeLists.txt | 4 +- src/index/vocabulary/PolymorphicVocabulary.h | 21 --------- .../vocabulary/VocabularyInternalExternal.h | 2 +- .../{ => vocabulary}/VocabularyOnDisk.cpp | 2 +- src/index/{ => vocabulary}/VocabularyOnDisk.h | 0 src/index/vocabulary/VocabularyType.h | 45 ++++++++++++++++--- src/util/File.h | 6 ++- src/util/ProgramOptionsHelpers.h | 2 +- test/CMakeLists.txt | 2 +- test/StringSortComparatorTest.cpp | 5 +++ test/index/vocabulary/CMakeLists.txt | 14 +++--- .../vocabulary/CompressedVocabularyTest.cpp | 2 +- .../vocabulary/PolymorphicVocabularyTest.cpp | 42 +++++++++++++++++ .../VocabularyInternalExternalTest.cpp | 2 +- .../index/vocabulary/VocabularyOnDiskTest.cpp | 2 +- test/index/vocabulary/VocabularyTypeTest.cpp | 36 +++++++++++++++ test/util/IndexTestHelpers.cpp | 2 + 21 files changed, 169 insertions(+), 75 deletions(-) rename src/index/{ => vocabulary}/VocabularyOnDisk.cpp (98%) rename src/index/{ => vocabulary}/VocabularyOnDisk.h (100%) create mode 100644 test/index/vocabulary/PolymorphicVocabularyTest.cpp create mode 100644 test/index/vocabulary/VocabularyTypeTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67b2feb62b..9402201159 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,14 +203,9 @@ if (${USE_CPP_17_BACKPORTS}) add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0") endif() -set(VOCAB_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary completely in RAM") -if (${VOCAB_IN_MEMORY}) - add_definitions("-D_QLEVER_VOCAB_IN_MEMORY") -endif () - -set(ENABLE_VOCAB_COMPRESSION ON CACHE BOOL "Compress the vocabulary") -if (${ENABLE_VOCAB_COMPRESSION}) - add_definitions("-D_QLEVER_ENABLE_VOCAB_COMPRESSION") +set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM") +if 
(${VOCAB_UNCOMPRESSED_IN_MEMORY}) + add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY") endif () # Enable the specification of additional linker flags manually from the commandline diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 28ca2a9c0e..9178e5d640 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -17,6 +17,7 @@ #include "util/File.h" #include "util/Generator.h" #include "util/Iterators.h" +#include "util/ResetWhenMoved.h" #include "util/Serializer/FileSerializer.h" #include "util/Serializer/SerializeVector.h" #include "util/TypeTraits.h" @@ -181,7 +182,9 @@ struct CompactStringVectorWriter { off_t _startOfFile; using offset_type = typename CompactVectorOfStrings::offset_type; std::vector _offsets; - bool _finished = false; + // A `CompactStringVectorWriter` that has been moved from may not call + // `finish()` any more in its destructor. + ad_utility::ResetWhenMoved _finished = false; offset_type _nextOffset = 0; explicit CompactStringVectorWriter(const std::string& filename) diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 4a226bdfdd..e421a03e55 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(vocabulary) add_library(index Index.cpp IndexImpl.cpp IndexImpl.Text.cpp - Vocabulary.cpp VocabularyOnDisk.cpp + Vocabulary.cpp LocatedTriples.cpp Permutation.cpp TextMetaData.cpp DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 7587275118..eecf3b832a 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -21,12 +21,9 @@ #include "global/Id.h" #include "global/Pattern.h" #include "index/StringSortComparator.h" -#include "index/VocabularyOnDisk.h" -#include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/PolymorphicVocabulary.h" #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" -#include 
"index/vocabulary/VocabularyInternalExternal.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -216,10 +213,16 @@ class Vocabulary { // vocabulary. UnderlyingVocabulary::WordWriter makeWordWriter( const std::string& filename) const { + // Note: In GCC this triggers a move construction of the created + // `DiskWriter`, although mandatory copy elision should kick in here + // according to our understanding (and does in clang). We could investigate + // whether this is a bug in GCC or whether we are missing something. return vocabulary_.getUnderlyingVocabulary().makeDiskWriter(filename); } - // TODO Comment. + // If the `UnderlyingVocabulary` is a `PolymorphicVocabulary`, close the + // vocabulary and set the type of the vocabulary according to the `type` + // argument (see the `PolymorphicVocabulary` class for details). void resetToType(ad_utility::VocabularyType type) { if constexpr (std::is_same_v) { vocabulary_.getUnderlyingVocabulary().resetToType(type); @@ -228,29 +231,17 @@ class Vocabulary { }; namespace detail { -// The two mactors `_QLEVER_VOCAB_IN_MEMORY` and -// `_QLEVER_ENABLE_VOCAB_COMPRESSION` can be used to disable the external vocab -// and the compression of the vocab at compile time. NOTE: These change the -// binary format of QLever's index, so changing them requires rebuilding of the -// indices. -/* -#ifdef _QLEVER_VOCAB_IN_MEMORY -using VocabStorage = VocabularyInMemory; -#else -using VocabStorage = VocabularyInternalExternal; -#endif -*/ +// The compile-time definition `_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY` can be +// used to disable the external vocab and the compression of the vocab at +// compile time. NOTE: This changes the binary format of QLever's index, so +// changing it requires rebuilding of the indices. 
-/* -#ifndef _QLEVER_ENABLE_VOCAB_COMPRESSION -using UnderlyingVocabRdfsVocabulary = VocabStorage; +#ifdef _QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY +using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; #else -using UnderlyingVocabRdfsVocabulary = CompressedVocabulary; +using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; #endif -*/ -// TODO Change this place. -using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; using UnderlyingVocabTextVocabulary = VocabularyInMemory; } // namespace detail diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index 910ad61c3a..ce746097da 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp - PolymorphicVocabulary.cpp - VocabularyType.h) + PolymorphicVocabulary.cpp VocabularyOnDisk.cpp + ) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/PolymorphicVocabulary.h b/src/index/vocabulary/PolymorphicVocabulary.h index 4a18e57465..02fa12b962 100644 --- a/src/index/vocabulary/PolymorphicVocabulary.h +++ b/src/index/vocabulary/PolymorphicVocabulary.h @@ -83,17 +83,6 @@ class PolymorphicVocabulary { vocab_); } - // Same as `lower_bound` above, but the comparator compares a `word` and an - // `iterator` instead of two words. - template - WordAndIndex lower_bound_iterator(const String& word, Comp comp) const { - return std::visit( - [&word, &comp](auto& vocab) { - return vocab.lower_bound_iterator(word, std::move(comp)); - }, - vocab_); - } - // Analogous to `lower_bound` (see above). template WordAndIndex upper_bound(const String& word, Comp comp) const { @@ -104,16 +93,6 @@ class PolymorphicVocabulary { vocab_); } - // Analogous to `lower_bound_iterator` (see above). 
- template - WordAndIndex upper_bound_iterator(const String& word, Comp comp) const { - return std::visit( - [&word, &comp](auto& vocab) { - return vocab.upper_bound_iterator(word, std::move(comp)); - }, - vocab_); - } - using WordWriters = polymorphic_vocabulary::detail::WriterPointers::type; diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index 897b29258d..209820c604 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -8,8 +8,8 @@ #include #include -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyInMemoryBinSearch.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyTypes.h" #include "util/Exception.h" diff --git a/src/index/VocabularyOnDisk.cpp b/src/index/vocabulary/VocabularyOnDisk.cpp similarity index 98% rename from src/index/VocabularyOnDisk.cpp rename to src/index/vocabulary/VocabularyOnDisk.cpp index 251130be26..1dc53e8453 100644 --- a/src/index/VocabularyOnDisk.cpp +++ b/src/index/vocabulary/VocabularyOnDisk.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include diff --git a/src/index/VocabularyOnDisk.h b/src/index/vocabulary/VocabularyOnDisk.h similarity index 100% rename from src/index/VocabularyOnDisk.h rename to src/index/vocabulary/VocabularyOnDisk.h diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index a6b0eacfb4..62036a495e 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -7,45 +7,72 @@ #include #include +#include "util/Random.h" #include "util/json.h" namespace ad_utility { + +// A lightweight enum for the different implementation strategies of the +// `PolymorphicVocabulary`. 
Also includes operations for conversion to and from +// string. +// TODO Implement a generic mixin that can also be used for other +// enums, especially those used in command-line interfaces. class VocabularyType { public: + // The different vocabulary implementations; enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; private: Enum value_ = Enum::InMemory; - static constexpr std::array descriptions{ + static constexpr size_t numValues_ = 4; + // All possible values. + static constexpr std::array all_{ + Enum::InMemory, Enum::OnDisk, Enum::CompressedInMemory, + Enum::CompressedOnDisk}; + + // The string representations of the enum values. + static constexpr std::array descriptions_{ "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", "on-disk-compressed"}; + static_assert(all_.size() == descriptions_.size()); + public: + // Constructors VocabularyType() = default; explicit VocabularyType(Enum value) : value_{value} {} + // Create from a string. The string must be one of the `descriptions_`, + // otherwise a `std::runtime_error` is thrown. static VocabularyType fromString(std::string_view description) { - auto it = ql::ranges::find(descriptions, description); - if (it == descriptions.end()) { + auto it = ql::ranges::find(descriptions_, description); + if (it == descriptions_.end()) { throw std::runtime_error{ absl::StrCat("\"", description, "\" is not a valid vocabulary type. The currently " "supported vocabulary types are ", getListOfSupportedValues())}; } - return VocabularyType{static_cast(it - descriptions.begin())}; + return VocabularyType{all().at(it - descriptions_.begin())}; } + // Return all the possible enum values as a comma-separated single string. static std::string getListOfSupportedValues() { - return absl::StrJoin(descriptions, ", "); + return absl::StrJoin(descriptions_, ", "); } + + // Convert the enum to the corresponding string. 
std::string_view toString() const { - return descriptions.at(static_cast(value_)); + return descriptions_.at(static_cast(value_)); } + // Return the actual enum value. Enum value() const { return value_; } + // Return a list of all the enum values. + static constexpr const std::array& all() { return all_; } + // Conversion To JSON. friend void to_json(nlohmann::json& j, const VocabularyType& vocabEnum) { j = vocabEnum.toString(); @@ -55,5 +82,11 @@ class VocabularyType { friend void from_json(const nlohmann::json& j, VocabularyType& vocabEnum) { vocabEnum = VocabularyType::fromString(static_cast(j)); } + + // Get a random value, useful for fuzz testing. + static VocabularyType random() { + ad_utility::FastRandomIntGenerator r; + return VocabularyType{static_cast(r() % numValues_)}; + } }; } // namespace ad_utility diff --git a/src/util/File.h b/src/util/File.h index cde77a4aaf..782e266380 100644 --- a/src/util/File.h +++ b/src/util/File.h @@ -52,6 +52,10 @@ class File { open(filename, mode); } + // Files are move-only types. 
+ File(const File&) = delete; + File& operator=(const File&) = delete; + File& operator=(File&& rhs) noexcept { if (isOpen()) { close(); @@ -63,7 +67,7 @@ class File { return *this; } - File(File&& rhs) : name_{std::move(rhs.name_)}, file_{rhs.file_} { + File(File&& rhs) noexcept : name_{std::move(rhs.name_)}, file_{rhs.file_} { rhs.file_ = nullptr; } diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index a86a850c35..b395768f50 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -55,7 +55,7 @@ void validate(boost::any& v, const std::vector& values, std::optional*, int) { // First parse as a T T* dummy = nullptr; - // using namespace boost::program_options; + using namespace boost::program_options; validate(v, values, dummy, 0); // Wrap the T inside std::optional diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3a04b9d201..994b4ea9ae 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -201,7 +201,7 @@ addLinkAndDiscoverTest(BatchedPipelineTest) addLinkAndDiscoverTest(TupleHelpersTest) -addLinkAndDiscoverTest(StringSortComparatorTest) +addLinkAndDiscoverTestNoLibs(StringSortComparatorTest) addLinkAndDiscoverTest(PriorityQueueTest) diff --git a/test/StringSortComparatorTest.cpp b/test/StringSortComparatorTest.cpp index ade2178ae0..b6143ec70f 100644 --- a/test/StringSortComparatorTest.cpp +++ b/test/StringSortComparatorTest.cpp @@ -125,6 +125,11 @@ TEST(StringSortComparatorTest, TripleComponentComparatorTotal) { auto bSplit = comparator.extractAndTransformComparable( b, TripleComponentComparator::Level::TOTAL); EXPECT_EQ(ab, comp(aSplit, bSplit)); + EXPECT_EQ(ab, comp(a, bSplit)); + EXPECT_EQ(ab, comp(aSplit, b)); + + EXPECT_EQ(ba, comp(b, aSplit)); + EXPECT_EQ(ba, comp(bSplit, a)); EXPECT_EQ(ba, comp(bSplit, aSplit)); }; diff --git a/test/index/vocabulary/CMakeLists.txt b/test/index/vocabulary/CMakeLists.txt index 3b4499a751..2db01bd594 100644 --- 
a/test/index/vocabulary/CMakeLists.txt +++ b/test/index/vocabulary/CMakeLists.txt @@ -1,11 +1,15 @@ -addLinkAndDiscoverTest(VocabularyInMemoryTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryTest vocabulary) -addLinkAndDiscoverTest(VocabularyOnDiskTest index) +addLinkAndDiscoverTestNoLibs(VocabularyOnDiskTest index) addLinkAndDiscoverTest(CompressedVocabularyTest vocabulary) -addLinkAndDiscoverTest(UnicodeVocabularyTest vocabulary) +addLinkAndDiscoverTestNoLibs(UnicodeVocabularyTest vocabulary) -addLinkAndDiscoverTest(VocabularyInternalExternalTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInternalExternalTest vocabulary) -addLinkAndDiscoverTest(VocabularyInMemoryBinSearchTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryBinSearchTest vocabulary) + +addLinkAndDiscoverTestNoLibs(PolymorphicVocabularyTest vocabulary) + +addLinkAndDiscoverTestNoLibs(VocabularyTypeTest) diff --git a/test/index/vocabulary/CompressedVocabularyTest.cpp b/test/index/vocabulary/CompressedVocabularyTest.cpp index a1a445e213..8a6f39d2bb 100644 --- a/test/index/vocabulary/CompressedVocabularyTest.cpp +++ b/test/index/vocabulary/CompressedVocabularyTest.cpp @@ -6,10 +6,10 @@ #include "VocabularyTestHelpers.h" #include "backports/algorithm.h" -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/PrefixCompressor.h" #include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyOnDisk.h" namespace { diff --git a/test/index/vocabulary/PolymorphicVocabularyTest.cpp b/test/index/vocabulary/PolymorphicVocabularyTest.cpp new file mode 100644 index 0000000000..fc01104d4c --- /dev/null +++ b/test/index/vocabulary/PolymorphicVocabularyTest.cpp @@ -0,0 +1,42 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/PolymorphicVocabulary.h" + +using ad_utility::VocabularyType; + +namespace { +void testForVocabType(VocabularyType::Enum vocabType) { + VocabularyType type{vocabType}; + std::string filename = + absl::StrCat("polymorphicVocabularyTest.", type.toString(), ".vocab"); + + auto writer = PolymorphicVocabulary::makeDiskWriter(filename, type); + writer("alpha", false); + writer("beta", true); + writer("gamma", false); + writer.finish(); + + PolymorphicVocabulary vocab; + vocab.open(filename, type); + EXPECT_EQ(vocab.size(), 3); + + EXPECT_EQ(vocab[0], "alpha"); + EXPECT_EQ(vocab[1], "beta"); + EXPECT_EQ(vocab[2], "gamma"); + + auto wI = vocab.lower_bound("alx", ql::ranges::less{}); + EXPECT_EQ(wI.index(), 1); + EXPECT_EQ(wI.word(), "beta"); + + wI = vocab.upper_bound("gamma", ql::ranges::less{}); + EXPECT_TRUE(wI.isEnd()); +} +} // namespace + +TEST(PolymorphicVocabulary, basicTests) { + ql::ranges::for_each(VocabularyType::all(), &testForVocabType); +} diff --git a/test/index/vocabulary/VocabularyInternalExternalTest.cpp b/test/index/vocabulary/VocabularyInternalExternalTest.cpp index 6c41dc415a..08ef9164dc 100644 --- a/test/index/vocabulary/VocabularyInternalExternalTest.cpp +++ b/test/index/vocabulary/VocabularyInternalExternalTest.cpp @@ -34,7 +34,7 @@ class VocabularyCreator { auto createVocabularyImpl(const std::vector& words) { VocabularyInternalExternal vocabulary; { - auto writer = VocabularyInternalExternal::WordWriter(vocabFilename_); + auto writer = VocabularyInternalExternal::makeDiskWriter(vocabFilename_); size_t i = 0; for (auto& word : words) { writer(word, i % 2 == 0); diff --git a/test/index/vocabulary/VocabularyOnDiskTest.cpp b/test/index/vocabulary/VocabularyOnDiskTest.cpp index 54fc934f24..ee9090125e 100644 --- a/test/index/vocabulary/VocabularyOnDiskTest.cpp +++ b/test/index/vocabulary/VocabularyOnDiskTest.cpp @@ -5,7 +5,7 @@ #include #include "./VocabularyTestHelpers.h" 
-#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "util/Forward.h" namespace { diff --git a/test/index/vocabulary/VocabularyTypeTest.cpp b/test/index/vocabulary/VocabularyTypeTest.cpp new file mode 100644 index 0000000000..2a8281dd80 --- /dev/null +++ b/test/index/vocabulary/VocabularyTypeTest.cpp @@ -0,0 +1,36 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/VocabularyType.h" + +using namespace ad_utility; +TEST(VocabularyType, allTests) { + using E = VocabularyType::Enum; + using T = VocabularyType; + T t{}; + EXPECT_EQ(t.value(), E::InMemory); + for (auto e : T::all()) { + EXPECT_EQ(T{e}.value(), e); + } + + t = T::fromString("on-disk-compressed"); + EXPECT_EQ(t.value(), E::CompressedOnDisk); + + EXPECT_ANY_THROW(T::fromString("kartoffelsalat")); + + EXPECT_EQ(T{E::OnDisk}.toString(), "on-disk-uncompressed"); + + using namespace ::testing; + EXPECT_THAT(T::getListOfSupportedValues(), + AllOf(HasSubstr("in-memory-uncompressed"), + HasSubstr(", on-disk-uncompressed"))); + + for (auto e : T::all()) { + nlohmann::json j = T{e}; + t = j.get(); + EXPECT_EQ(t.value(), e); + } +} diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 8e1a693209..6cc5724690 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -186,6 +186,8 @@ Index makeTestIndex(const std::string& indexBasename, index.loadAllPermutations() = loadAllPermutations; qlever::InputFileSpecification spec{inputFilename, qlever::Filetype::Turtle, std::nullopt}; + // randomly choose one of the vocabulary implementations + index.getImpl().setVocabularyTypeForIndexBuilding(VocabularyType::random()); index.createFromFiles({spec}); if (createTextIndex) { if (contentsOfWordsFileAndDocsFile.has_value()) { From 5f2ec6c2ca2850691bd66a80ae0f1d2db97f5ba7 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: 
Thu, 6 Feb 2025 12:51:15 +0100 Subject: [PATCH 11/13] Fix for MacOS... Signed-off-by: Johannes Kalmbach --- src/index/vocabulary/PolymorphicVocabulary.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp index 1b9936afee..1c328dc6aa 100644 --- a/src/index/vocabulary/PolymorphicVocabulary.cpp +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -25,7 +25,7 @@ void PolymorphicVocabulary::close() { // _____________________________________________________________________________ size_t PolymorphicVocabulary::size() const { - return std::visit([](auto& vocab) { return vocab.size(); }, vocab_); + return std::visit([](auto& vocab) -> size_t { return vocab.size(); }, vocab_); } // _____________________________________________________________________________ From d8080b30f9914a89e3ed3dcda9c9ccf85a880795 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 6 Feb 2025 17:33:33 +0100 Subject: [PATCH 12/13] Many more improvements for the tests and for the tools. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 17 ++++++++++++++++- src/index/vocabulary/VocabularyOnDisk.cpp | 6 +++--- src/index/vocabulary/VocabularyOnDisk.h | 4 ++-- .../vocabulary/PolymorphicVocabularyTest.cpp | 11 +++++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 9178e5d640..c98487e772 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -182,6 +182,7 @@ struct CompactStringVectorWriter { off_t _startOfFile; using offset_type = typename CompactVectorOfStrings::offset_type; std::vector _offsets; + // A `CompactStringVectorWriter` that has been moved from may not call // `finish()` any more in its destructor. 
ad_utility::ResetWhenMoved _finished = false; @@ -230,6 +231,16 @@ struct CompactStringVectorWriter { } } + // The copy operations would be deleted implicitly (because `File` is not + // copyable). + CompactStringVectorWriter(const CompactStringVectorWriter&) = delete; + CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) = + delete; + + // The move operations have to be explicitly defaulted, because we have a + // manually defined destructor. + // Note: The defaulted move operations behave correctly because of the usage + // of `ResetWhenMoved` with the `_finished` member. CompactStringVectorWriter(CompactStringVectorWriter&&) = default; CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; @@ -237,12 +248,16 @@ struct CompactStringVectorWriter { // Has to be run by all the constructors void commonInitialization() { AD_CONTRACT_CHECK(_file.isOpen()); - // We don't known the data size yet. + // We don't know the data size yet. _startOfFile = _file.tell(); size_t dataSizeDummy = 0; _file.write(&dataSizeDummy, sizeof(dataSizeDummy)); } }; +static_assert( + std::is_nothrow_move_assignable_v>); +static_assert( + std::is_nothrow_move_constructible_v>); } // namespace detail // Forward iterator for a `CompactVectorOfStrings` that reads directly from diff --git a/src/index/vocabulary/VocabularyOnDisk.cpp b/src/index/vocabulary/VocabularyOnDisk.cpp index 1dc53e8453..8f23170300 100644 --- a/src/index/vocabulary/VocabularyOnDisk.cpp +++ b/src/index/vocabulary/VocabularyOnDisk.cpp @@ -23,8 +23,8 @@ OffsetAndSize VocabularyOnDisk::getOffsetAndSize(uint64_t i) const { std::string VocabularyOnDisk::operator[](uint64_t idx) const { AD_CONTRACT_CHECK(idx < size()); auto offsetAndSize = getOffsetAndSize(idx); - string result(offsetAndSize._size, '\0'); - file_.read(result.data(), offsetAndSize._size, offsetAndSize._offset); + string result(offsetAndSize.size_, '\0'); + file_.read(result.data(), offsetAndSize.size_, offsetAndSize.offset_); 
return result; } @@ -88,7 +88,7 @@ VocabularyOnDisk::WordWriter::~WordWriter() { void VocabularyOnDisk::buildFromStringsAndIds( const std::vector>& wordsAndIds, const std::string& fileName) { - return buildFromIterable(wordsAndIds, fileName); + buildFromIterable(wordsAndIds, fileName); } // _____________________________________________________________________________ diff --git a/src/index/vocabulary/VocabularyOnDisk.h b/src/index/vocabulary/VocabularyOnDisk.h index f677ac3e7a..87506a4ed5 100644 --- a/src/index/vocabulary/VocabularyOnDisk.h +++ b/src/index/vocabulary/VocabularyOnDisk.h @@ -86,8 +86,8 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { // The offset of a word in `file_` and its size in number of bytes. struct OffsetAndSize { - uint64_t _offset; - uint64_t _size; + uint64_t offset_; + uint64_t size_; }; // Helper function for implementing a random access iterator. diff --git a/test/index/vocabulary/PolymorphicVocabularyTest.cpp b/test/index/vocabulary/PolymorphicVocabularyTest.cpp index fc01104d4c..c5c91ed686 100644 --- a/test/index/vocabulary/PolymorphicVocabularyTest.cpp +++ b/test/index/vocabulary/PolymorphicVocabularyTest.cpp @@ -9,6 +9,8 @@ using ad_utility::VocabularyType; namespace { + +// Test a `PolymorphicVocabulary` with a given `vocabType`. void testForVocabType(VocabularyType::Enum vocabType) { VocabularyType type{vocabType}; std::string filename = @@ -37,6 +39,15 @@ void testForVocabType(VocabularyType::Enum vocabType) { } } // namespace +// Test the general functionality of the `PolymorphicVocabulary` for all the +// possible `VocabularyType`s. TEST(PolymorphicVocabulary, basicTests) { ql::ranges::for_each(VocabularyType::all(), &testForVocabType); } + +// Test a corner case in a `switch` statement. 
+TEST(PolymorphicVocabulary, invalidVocabularyType) { + PolymorphicVocabulary vocab; + auto invalidType = VocabularyType{static_cast(23401)}; + EXPECT_ANY_THROW(vocab.resetToType(invalidType)); +} From b2b71c2eee333e9b3649655638322d8f22ad2985 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 19 Feb 2025 08:40:51 +0100 Subject: [PATCH 13/13] Merge in the master, and fix some sonarcloud things. Signed-off-by: Johannes Kalmbach --- src/global/Pattern.h | 104 +++++++++--------- src/index/IndexImpl.cpp | 2 +- src/index/IndexImpl.h | 2 +- .../vocabulary/PolymorphicVocabulary.cpp | 8 +- src/index/vocabulary/VocabularyType.h | 13 ++- test/index/vocabulary/VocabularyTypeTest.cpp | 7 +- 6 files changed, 70 insertions(+), 66 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 51ba94220f..9495f02206 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -40,8 +40,8 @@ struct Pattern { using ref = value_type&; using const_ref = const value_type&; - ref operator[](const size_t pos) { return _data[pos]; } - const_ref operator[](const size_t pos) const { return _data[pos]; } + ref operator[](const size_t pos) { return data_[pos]; } + const_ref operator[](const size_t pos) const { return data_[pos]; } using const_iterator = ad_utility::IteratorForAccessOperator< Pattern, ad_utility::AccessViaBracketOperator, ad_utility::IsConst::True>; @@ -52,19 +52,19 @@ struct Pattern { bool operator==(const Pattern& other) const = default; - size_t size() const { return _data.size(); } + size_t size() const { return data_.size(); } - void push_back(value_type i) { _data.push_back(i); } + void push_back(value_type i) { data_.push_back(i); } - void clear() { _data.clear(); } + void clear() { data_.clear(); } - const_ref back() const { return _data.back(); } - ref back() { return _data.back(); } - bool empty() const { return _data.empty(); } + const_ref back() const { return data_.back(); } + ref back() { return data_.back(); } + bool empty() const { 
return data_.empty(); } - const value_type* data() const { return _data.data(); } + const value_type* data() const { return data_.data(); } - std::vector _data; + std::vector data_; }; namespace detail { @@ -114,19 +114,19 @@ class CompactVectorOfStrings { static_assert( ad_utility::SimilarTobegin())), data_type>); // Also make room for the end offset of the last element. - _offsets.reserve(input.size() + 1); + offsets_.reserve(input.size() + 1); size_t dataSize = 0; for (const auto& element : input) { - _offsets.push_back(dataSize); + offsets_.push_back(dataSize); dataSize += element.size(); } // The last offset is the offset right after the last element. - _offsets.push_back(dataSize); + offsets_.push_back(dataSize); - _data.reserve(dataSize); + data_.reserve(dataSize); for (const auto& el : input) { - _data.insert(_data.end(), el.begin(), el.end()); + data_.insert(data_.end(), el.begin(), el.end()); } } @@ -138,9 +138,9 @@ class CompactVectorOfStrings { CompactVectorOfStrings(CompactVectorOfStrings&&) noexcept = default; // There is one more offset than the number of elements. - size_t size() const { return ready() ? _offsets.size() - 1 : 0; } + size_t size() const { return ready() ? offsets_.size() - 1 : 0; } - bool ready() const { return !_offsets.empty(); } + bool ready() const { return !offsets_.empty(); } /** * @brief operator [] @@ -149,9 +149,9 @@ class CompactVectorOfStrings { * elements stored at the pointers target. */ const value_type operator[](size_t i) const { - offset_type offset = _offsets[i]; - const data_type* ptr = _data.data() + offset; - size_t size = _offsets[i + 1] - offset; + offset_type offset = offsets_[i]; + const data_type* ptr = data_.data() + offset; + size_t size = offsets_[i + 1] - offset; return {ptr, size}; } @@ -170,13 +170,13 @@ class CompactVectorOfStrings { // Allow serialization via the ad_utility::serialization interface. 
AD_SERIALIZE_FRIEND_FUNCTION(CompactVectorOfStrings) { - serializer | arg._data; - serializer | arg._offsets; + serializer | arg.data_; + serializer | arg.offsets_; } private: - std::vector _data; - std::vector _offsets; + std::vector data_; + std::vector offsets_; }; namespace detail { @@ -184,52 +184,52 @@ namespace detail { // file. template struct CompactStringVectorWriter { - ad_utility::File _file; - off_t _startOfFile; + ad_utility::File file_; + off_t startOfFile_; using offset_type = typename CompactVectorOfStrings::offset_type; - std::vector _offsets; + std::vector offsets_; // A `CompactStringVectorWriter` that has been moved from may not call // `finish()` any more in its destructor. - ad_utility::ResetWhenMoved _finished = false; - offset_type _nextOffset = 0; + ad_utility::ResetWhenMoved finished_ = false; + offset_type nextOffset_ = 0; explicit CompactStringVectorWriter(const std::string& filename) - : _file{filename, "w"} { + : file_{filename, "w"} { commonInitialization(); } explicit CompactStringVectorWriter(ad_utility::File&& file) - : _file{std::move(file)} { + : file_{std::move(file)} { commonInitialization(); } void push(const data_type* data, size_t elementSize) { - AD_CONTRACT_CHECK(!_finished); - _offsets.push_back(_nextOffset); - _nextOffset += elementSize; - _file.write(data, elementSize * sizeof(data_type)); + AD_CONTRACT_CHECK(!finished_); + offsets_.push_back(nextOffset_); + nextOffset_ += elementSize; + file_.write(data, elementSize * sizeof(data_type)); } // Finish writing, and return the moved file. If the return value is // discarded, then the file will be closed immediately by the destructor of // the `File` class. 
ad_utility::File finish() { - if (_finished) { + if (finished_) { return {}; } - _finished = true; - _offsets.push_back(_nextOffset); - _file.seek(_startOfFile, SEEK_SET); - _file.write(&_nextOffset, sizeof(size_t)); - _file.seek(0, SEEK_END); - ad_utility::serialization::FileWriteSerializer f{std::move(_file)}; - f << _offsets; + finished_ = true; + offsets_.push_back(nextOffset_); + file_.seek(startOfFile_, SEEK_SET); + file_.write(&nextOffset_, sizeof(size_t)); + file_.seek(0, SEEK_END); + ad_utility::serialization::FileWriteSerializer f{std::move(file_)}; + f << offsets_; return std::move(f).file(); } ~CompactStringVectorWriter() { - if (!_finished) { + if (!finished_) { ad_utility::terminateIfThrows( [this]() { finish(); }, "Finishing the underlying File of a `CompactStringVectorWriter` " @@ -246,18 +246,18 @@ struct CompactStringVectorWriter { // The move operations have to be explicitly defaulted, because we have a // manually defined destructor. // Note: The defaulted move operations behave correctly because of the usage - // of `ResetWhenMoved` with the `_finished` member. + // of `ResetWhenMoved` with the `finished_` member. CompactStringVectorWriter(CompactStringVectorWriter&&) = default; CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; private: // Has to be run by all the constructors void commonInitialization() { - AD_CONTRACT_CHECK(_file.isOpen()); + AD_CONTRACT_CHECK(file_.isOpen()); // We don't know the data size yet. 
- _startOfFile = _file.tell(); + startOfFile_ = file_.tell(); size_t dataSizeDummy = 0; - _file.write(&dataSizeDummy, sizeof(dataSizeDummy)); + file_.write(&dataSizeDummy, sizeof(dataSizeDummy)); } }; static_assert( @@ -303,13 +303,11 @@ CompactVectorOfStrings::diskIterator(string filename) { } } -namespace std { template <> -struct hash { - std::size_t operator()(const Pattern& p) const { +struct std::hash { + std::size_t operator()(const Pattern& p) const noexcept { std::string_view s = std::string_view( - reinterpret_cast(p._data.data()), sizeof(Id) * p.size()); + reinterpret_cast(p.data_.data()), sizeof(Id) * p.size()); return hash()(s); } }; -} // namespace std diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index fed1674b10..adbbde1003 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1136,7 +1136,7 @@ void IndexImpl::readConfiguration() { loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); ad_utility::VocabularyType vocabType( - ad_utility::VocabularyType::Enum::CompressedOnDisk); + ad_utility::VocabularyType::Enum::OnDiskCompressed); loadDataMember("vocabulary-type", vocabType, vocabType); vocab_.resetToType(vocabType); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index e486612b24..a803a5983d 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -195,7 +195,7 @@ class IndexImpl { // The vocabulary type that is used (only relevant during index building). 
ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ - ad_utility::VocabularyType::Enum::CompressedOnDisk}; + ad_utility::VocabularyType::Enum::OnDiskCompressed}; // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp index 1c328dc6aa..27f48a7db1 100644 --- a/src/index/vocabulary/PolymorphicVocabulary.cpp +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -72,16 +72,16 @@ PolymorphicVocabulary::WordWriter PolymorphicVocabulary::makeDiskWriter( void PolymorphicVocabulary::resetToType(VocabularyType type) { close(); switch (type.value()) { - case VocabularyType::Enum::InMemory: + case VocabularyType::Enum::InMemoryUncompressed: vocab_.emplace(); break; - case VocabularyType::Enum::OnDisk: + case VocabularyType::Enum::OnDiskUncompressed: vocab_.emplace(); break; - case VocabularyType::Enum::CompressedInMemory: + case VocabularyType::Enum::InMemoryCompressed: vocab_.emplace(); break; - case VocabularyType::Enum::CompressedOnDisk: + case VocabularyType::Enum::OnDiskCompressed: vocab_.emplace(); break; default: diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h index 62036a495e..b3a9cdf245 100644 --- a/src/index/vocabulary/VocabularyType.h +++ b/src/index/vocabulary/VocabularyType.h @@ -20,16 +20,21 @@ namespace ad_utility { class VocabularyType { public: // The different vocabulary implementations; - enum struct Enum { InMemory, OnDisk, CompressedInMemory, CompressedOnDisk }; + enum struct Enum { + InMemoryUncompressed, + OnDiskUncompressed, + InMemoryCompressed, + OnDiskCompressed + }; private: - Enum value_ = Enum::InMemory; + Enum value_ = Enum::InMemoryUncompressed; static constexpr size_t numValues_ = 4; // All possible values. 
static constexpr std::array all_{ - Enum::InMemory, Enum::OnDisk, Enum::CompressedInMemory, - Enum::CompressedOnDisk}; + Enum::InMemoryUncompressed, Enum::OnDiskUncompressed, + Enum::InMemoryCompressed, Enum::OnDiskCompressed}; // The string representations of the enum values. static constexpr std::array descriptions_{ diff --git a/test/index/vocabulary/VocabularyTypeTest.cpp b/test/index/vocabulary/VocabularyTypeTest.cpp index 2a8281dd80..180a82e159 100644 --- a/test/index/vocabulary/VocabularyTypeTest.cpp +++ b/test/index/vocabulary/VocabularyTypeTest.cpp @@ -7,21 +7,22 @@ #include "index/vocabulary/VocabularyType.h" using namespace ad_utility; +// Simple tests for the glorified enum `VocabularyType`. TEST(VocabularyType, allTests) { using E = VocabularyType::Enum; using T = VocabularyType; T t{}; - EXPECT_EQ(t.value(), E::InMemory); + EXPECT_EQ(t.value(), E::InMemoryUncompressed); for (auto e : T::all()) { EXPECT_EQ(T{e}.value(), e); } t = T::fromString("on-disk-compressed"); - EXPECT_EQ(t.value(), E::CompressedOnDisk); + EXPECT_EQ(t.value(), E::OnDiskCompressed); EXPECT_ANY_THROW(T::fromString("kartoffelsalat")); - EXPECT_EQ(T{E::OnDisk}.toString(), "on-disk-uncompressed"); + EXPECT_EQ(T{E::OnDiskUncompressed}.toString(), "on-disk-uncompressed"); using namespace ::testing; EXPECT_THAT(T::getListOfSupportedValues(),