diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d541d8a32..8ee99ead7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,11 @@ if (${USE_CPP_17_BACKPORTS}) add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0") endif() +set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM") +if (${VOCAB_UNCOMPRESSED_IN_MEMORY}) + add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY") +endif () + # Enable the specification of additional linker flags manually from the commandline set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}") diff --git a/src/engine/ExportQueryExecutionTrees.cpp b/src/engine/ExportQueryExecutionTrees.cpp index f50bca9f61..86292bad8f 100644 --- a/src/engine/ExportQueryExecutionTrees.cpp +++ b/src/engine/ExportQueryExecutionTrees.cpp @@ -357,8 +357,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex( case Datatype::LocalVocabIndex: return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri(); case Datatype::VocabIndex: { - auto entity = index.indexToString(id.getVocabIndex()); - return LiteralOrIri::fromStringRepresentation(entity); + auto getEntity = [&index, id]() { + return index.indexToString(id.getVocabIndex()); + }; + // The type of entity might be `string_view` (If the vocabulary is stored + // uncompressed in RAM) or `string` (if it is on-disk, or compressed or + // both). The following code works and is efficient in all cases. In + // particular, the `std::string` constructor is compiled out because of + // RVO if `getEntity()` already returns a `string`. 
+ return LiteralOrIri::fromStringRepresentation(std::string(getEntity())); } default: AD_FAIL(); diff --git a/src/global/Pattern.h b/src/global/Pattern.h index b69540e7c6..9495f02206 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -18,6 +18,7 @@ #include "util/File.h" #include "util/Generator.h" #include "util/Iterators.h" +#include "util/ResetWhenMoved.h" #include "util/Serializer/FileSerializer.h" #include "util/Serializer/SerializeVector.h" #include "util/TypeTraits.h" @@ -39,8 +40,8 @@ struct Pattern { using ref = value_type&; using const_ref = const value_type&; - ref operator[](const size_t pos) { return _data[pos]; } - const_ref operator[](const size_t pos) const { return _data[pos]; } + ref operator[](const size_t pos) { return data_[pos]; } + const_ref operator[](const size_t pos) const { return data_[pos]; } using const_iterator = ad_utility::IteratorForAccessOperator< Pattern, ad_utility::AccessViaBracketOperator, ad_utility::IsConst::True>; @@ -51,19 +52,19 @@ struct Pattern { bool operator==(const Pattern& other) const = default; - size_t size() const { return _data.size(); } + size_t size() const { return data_.size(); } - void push_back(value_type i) { _data.push_back(i); } + void push_back(value_type i) { data_.push_back(i); } - void clear() { _data.clear(); } + void clear() { data_.clear(); } - const_ref back() const { return _data.back(); } - ref back() { return _data.back(); } - bool empty() const { return _data.empty(); } + const_ref back() const { return data_.back(); } + ref back() { return data_.back(); } + bool empty() const { return data_.empty(); } - const value_type* data() const { return _data.data(); } + const value_type* data() const { return data_.data(); } - std::vector _data; + std::vector data_; }; namespace detail { @@ -113,19 +114,19 @@ class CompactVectorOfStrings { static_assert( ad_utility::SimilarTobegin())), data_type>); // Also make room for the end offset of the last element. 
- _offsets.reserve(input.size() + 1); + offsets_.reserve(input.size() + 1); size_t dataSize = 0; for (const auto& element : input) { - _offsets.push_back(dataSize); + offsets_.push_back(dataSize); dataSize += element.size(); } // The last offset is the offset right after the last element. - _offsets.push_back(dataSize); + offsets_.push_back(dataSize); - _data.reserve(dataSize); + data_.reserve(dataSize); for (const auto& el : input) { - _data.insert(_data.end(), el.begin(), el.end()); + data_.insert(data_.end(), el.begin(), el.end()); } } @@ -137,9 +138,9 @@ class CompactVectorOfStrings { CompactVectorOfStrings(CompactVectorOfStrings&&) noexcept = default; // There is one more offset than the number of elements. - size_t size() const { return ready() ? _offsets.size() - 1 : 0; } + size_t size() const { return ready() ? offsets_.size() - 1 : 0; } - bool ready() const { return !_offsets.empty(); } + bool ready() const { return !offsets_.empty(); } /** * @brief operator [] @@ -148,9 +149,9 @@ class CompactVectorOfStrings { * elements stored at the pointers target. */ const value_type operator[](size_t i) const { - offset_type offset = _offsets[i]; - const data_type* ptr = _data.data() + offset; - size_t size = _offsets[i + 1] - offset; + offset_type offset = offsets_[i]; + const data_type* ptr = data_.data() + offset; + size_t size = offsets_[i + 1] - offset; return {ptr, size}; } @@ -169,13 +170,13 @@ class CompactVectorOfStrings { // Allow serialization via the ad_utility::serialization interface. AD_SERIALIZE_FRIEND_FUNCTION(CompactVectorOfStrings) { - serializer | arg._data; - serializer | arg._offsets; + serializer | arg.data_; + serializer | arg.offsets_; } private: - std::vector _data; - std::vector _offsets; + std::vector data_; + std::vector offsets_; }; namespace detail { @@ -183,49 +184,52 @@ namespace detail { // file. 
template struct CompactStringVectorWriter { - ad_utility::File _file; - off_t _startOfFile; + ad_utility::File file_; + off_t startOfFile_; using offset_type = typename CompactVectorOfStrings::offset_type; - std::vector _offsets; - bool _finished = false; - offset_type _nextOffset = 0; + std::vector offsets_; + + // A `CompactStringVectorWriter` that has been moved from may not call + // `finish()` any more in its destructor. + ad_utility::ResetWhenMoved finished_ = false; + offset_type nextOffset_ = 0; explicit CompactStringVectorWriter(const std::string& filename) - : _file{filename, "w"} { + : file_{filename, "w"} { commonInitialization(); } explicit CompactStringVectorWriter(ad_utility::File&& file) - : _file{std::move(file)} { + : file_{std::move(file)} { commonInitialization(); } void push(const data_type* data, size_t elementSize) { - AD_CONTRACT_CHECK(!_finished); - _offsets.push_back(_nextOffset); - _nextOffset += elementSize; - _file.write(data, elementSize * sizeof(data_type)); + AD_CONTRACT_CHECK(!finished_); + offsets_.push_back(nextOffset_); + nextOffset_ += elementSize; + file_.write(data, elementSize * sizeof(data_type)); } // Finish writing, and return the moved file. If the return value is // discarded, then the file will be closed immediately by the destructor of // the `File` class. 
ad_utility::File finish() { - if (_finished) { + if (finished_) { return {}; } - _finished = true; - _offsets.push_back(_nextOffset); - _file.seek(_startOfFile, SEEK_SET); - _file.write(&_nextOffset, sizeof(size_t)); - _file.seek(0, SEEK_END); - ad_utility::serialization::FileWriteSerializer f{std::move(_file)}; - f << _offsets; + finished_ = true; + offsets_.push_back(nextOffset_); + file_.seek(startOfFile_, SEEK_SET); + file_.write(&nextOffset_, sizeof(size_t)); + file_.seek(0, SEEK_END); + ad_utility::serialization::FileWriteSerializer f{std::move(file_)}; + f << offsets_; return std::move(f).file(); } ~CompactStringVectorWriter() { - if (!_finished) { + if (!finished_) { ad_utility::terminateIfThrows( [this]() { finish(); }, "Finishing the underlying File of a `CompactStringVectorWriter` " @@ -233,16 +237,33 @@ struct CompactStringVectorWriter { } } + // The copy operations would be deleted implicitly (because `File` is not + // copyable). + CompactStringVectorWriter(const CompactStringVectorWriter&) = delete; + CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) = + delete; + + // The move operations have to be explicitly defaulted, because we have a + // manually defined destructor. + // Note: The defaulted move operations behave correctly because of the usage + // of `ResetWhenMoved` with the `finished_` member. + CompactStringVectorWriter(CompactStringVectorWriter&&) = default; + CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default; + private: // Has to be run by all the constructors void commonInitialization() { - AD_CONTRACT_CHECK(_file.isOpen()); - // We don't known the data size yet. - _startOfFile = _file.tell(); + AD_CONTRACT_CHECK(file_.isOpen()); + // We don't know the data size yet.
+ startOfFile_ = file_.tell(); size_t dataSizeDummy = 0; - _file.write(&dataSizeDummy, sizeof(dataSizeDummy)); + file_.write(&dataSizeDummy, sizeof(dataSizeDummy)); } }; +static_assert( + std::is_nothrow_move_assignable_v>); +static_assert( + std::is_nothrow_move_constructible_v>); } // namespace detail // Forward iterator for a `CompactVectorOfStrings` that reads directly from @@ -282,13 +303,11 @@ CompactVectorOfStrings::diskIterator(string filename) { } } -namespace std { template <> -struct hash { - std::size_t operator()(const Pattern& p) const { +struct std::hash { + std::size_t operator()(const Pattern& p) const noexcept { std::string_view s = std::string_view( - reinterpret_cast(p._data.data()), sizeof(Id) * p.size()); + reinterpret_cast(p.data_.data()), sizeof(Id) * p.size()); return hash()(s); } }; -} // namespace std diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 4a226bdfdd..e421a03e55 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(vocabulary) add_library(index Index.cpp IndexImpl.cpp IndexImpl.Text.cpp - Vocabulary.cpp VocabularyOnDisk.cpp + Vocabulary.cpp LocatedTriples.cpp Permutation.cpp TextMetaData.cpp DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index d7c1802969..4ca58f3e80 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -99,7 +99,8 @@ constinit inline std::atomic BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS = // the overhead of the metadata that has to be stored per block becomes // infeasible. 250K seems to be a reasonable tradeoff here. 
constexpr inline ad_utility::MemorySize - UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB; + UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = + ad_utility::MemorySize::kilobytes(250); constexpr inline size_t NumColumnsIndexBuilding = 4; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index f66914bfca..06350e1e26 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -71,12 +71,13 @@ size_t Index::getCardinality( } // ____________________________________________________________________________ -std::string Index::indexToString(VocabIndex id) const { +auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType { return pimpl_->indexToString(id); } // ____________________________________________________________________________ -std::string_view Index::indexToString(WordVocabIndex id) const { +auto Index::indexToString(WordVocabIndex id) const + -> TextVocabulary::AccessReturnType { return pimpl_->indexToString(id); } diff --git a/src/index/Index.h b/src/index/Index.h index 8c6dd1cd40..101908ab7e 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -104,13 +104,11 @@ class Index { // Read necessary metadata into memory and open file handles. void addTextFromOnDiskIndex(); - using Vocab = - Vocabulary; + using Vocab = RdfsVocabulary; [[nodiscard]] const Vocab& getVocab() const; Vocab& getNonConstVocabForTesting(); - using TextVocab = - Vocabulary; + using TextVocab = TextVocabulary; [[nodiscard]] const TextVocab& getTextVocab() const; // Get a (non-owning) pointer to the BlankNodeManager of this Index. @@ -132,8 +130,8 @@ class Index { // TODO Once we have an overview over the folding this logic should // probably not be in the index class. 
- std::string indexToString(VocabIndex id) const; - std::string_view indexToString(WordVocabIndex id) const; + Vocab::AccessReturnType indexToString(VocabIndex id) const; + TextVocab::AccessReturnType indexToString(WordVocabIndex id) const; [[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const; diff --git a/src/index/IndexBuilderMain.cpp b/src/index/IndexBuilderMain.cpp index cfc121a2d1..034e76050d 100644 --- a/src/index/IndexBuilderMain.cpp +++ b/src/index/IndexBuilderMain.cpp @@ -11,6 +11,7 @@ #include #include "CompilationInfo.h" +#include "IndexImpl.h" #include "global/Constants.h" #include "index/ConstantsIndexBuilding.h" #include "index/Index.h" @@ -166,6 +167,7 @@ int main(int argc, char** argv) { bool addWordsFromLiterals = false; std::optional stxxlMemory; std::optional parserBufferSize; + std::optional vocabType; optind = 1; Index index{ad_utility::makeUnlimitedAllocator()}; @@ -224,6 +226,10 @@ int main(int argc, char** argv) { add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos), "Only build the PSO and POS permutations. This is faster, but then " "queries with predicate variables are not supported"); + auto msg = absl::StrCat( + "The vocabulary implementation for strings in qlever, can be any of ", + ad_utility::VocabularyType::getListOfSupportedValues()); + add("vocabulary-type", po::value(&vocabType), msg.c_str()); // Options for the index building process. add("stxxl-memory,m", po::value(&stxxlMemory), @@ -257,6 +263,10 @@ int main(int argc, char** argv) { index.parserBufferSize() = parserBufferSize.value(); } + if (vocabType.has_value()) { + index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value()); + } + // If no text index name was specified, take the part of the wordsfile after // the last slash. 
if (textIndexName.empty() && !wordsfile.empty()) { diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 3b872eb39c..2f15be7e5c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -48,7 +48,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - WordsFileLine entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{std::string{text}, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 24a8f1c77e..adbbde1003 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -343,6 +343,8 @@ void IndexImpl::createFromFiles( "The patterns can only be built when all 6 permutations are created"}; } + vocab_.resetToType(vocabularyTypeForIndexBuilding_); + readIndexBuilderSettingsFromFile(); updateInputFileSpecificationsAndLog(files, useParallelParser_); @@ -562,7 +564,6 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( return (*cmp)(a, b, decltype(vocab_)::SortLevel::TOTAL); }; auto wordCallback = vocab_.makeWordWriter(onDiskBase_ + VOCAB_SUFFIX); - wordCallback.readableName() = "internal vocabulary"; return ad_utility::vocabulary_merger::mergeVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); @@ -976,7 +977,7 @@ size_t IndexImpl::getNumDistinctSubjectPredicatePairs() const { } // _____________________________________________________________________________ -bool IndexImpl::isLiteral(const string& object) const { +bool IndexImpl::isLiteral(std::string_view object) const { return decltype(vocab_)::stringIsLiteral(object); } @@ -1134,6 +1135,11 @@ void IndexImpl::readConfiguration() { loadDataMember("num-triples", numTriples_, NumNormalAndInternal{}); loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0); + ad_utility::VocabularyType vocabType( 
+ ad_utility::VocabularyType::Enum::OnDiskCompressed); + loadDataMember("vocabulary-type", vocabType, vocabType); + vocab_.resetToType(vocabType); + // Initialize BlankNodeManager uint64_t numBlankNodesTotal; loadDataMember("num-blank-nodes-total", numBlankNodesTotal); @@ -1524,10 +1530,13 @@ size_t IndexImpl::getCardinality( } // ___________________________________________________________________________ -std::string IndexImpl::indexToString(VocabIndex id) const { return vocab_[id]; } +RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const { + return vocab_[id]; +} // ___________________________________________________________________________ -std::string_view IndexImpl::indexToString(WordVocabIndex id) const { +TextVocabulary::AccessReturnType IndexImpl::indexToString( + WordVocabIndex id) const { return textVocab_[id]; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8daef8ccd3..a803a5983d 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -193,6 +193,10 @@ class IndexImpl { std::optional idOfHasPatternDuringIndexBuilding_; std::optional idOfInternalGraphDuringIndexBuilding_; + // The vocabulary type that is used (only relevant during index building). + ad_utility::VocabularyType vocabularyTypeForIndexBuilding_{ + ad_utility::VocabularyType::Enum::OnDiskCompressed}; + // BlankNodeManager, initialized during `readConfiguration` std::unique_ptr blankNodeManager_{nullptr}; @@ -276,6 +280,13 @@ class IndexImpl { return deltaTriples_.value(); } + // See the documentation of the `vocabularyTypeForIndexBuilding_` member for + // details. 
+ void setVocabularyTypeForIndexBuilding(ad_utility::VocabularyType type) { + vocabularyTypeForIndexBuilding_ = type; + configurationJson_["vocabulary-type"] = type; + } + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- @@ -306,10 +317,10 @@ class IndexImpl { const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________________ - std::string indexToString(VocabIndex id) const; + RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const; // ___________________________________________________________________________ - std::string_view indexToString(WordVocabIndex id) const; + TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const; public: // ___________________________________________________________________________ @@ -635,7 +646,7 @@ class IndexImpl { friend class IndexTest_createFromOnDiskIndexTest_Test; friend class CreatePatternsFixture_createPatterns_Test; - bool isLiteral(const string& object) const; + bool isLiteral(std::string_view object) const; public: LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const; diff --git a/src/index/StringSortComparator.h b/src/index/StringSortComparator.h index b5cf70c6d4..cfdd08dceb 100644 --- a/src/index/StringSortComparator.h +++ b/src/index/StringSortComparator.h @@ -623,6 +623,12 @@ class TripleComponentComparator { return compare(spA, spB, level) < 0; } + bool operator()(const SplitVal& spA, std::string_view b, + const Level level) const { + auto spB = extractAndTransformComparable(b, level, false); + return compare(spA, spB, level) < 0; + } + template bool operator()(const SplitValBase& a, const SplitValBase& b, const Level level) const { diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index ab2cb52505..80c61cc0ea 100644 --- a/src/index/Vocabulary.cpp +++ 
b/src/index/Vocabulary.cpp @@ -39,20 +39,8 @@ bool Vocabulary::PrefixRanges::contain( // _____________________________________________________________________________ template void Vocabulary::readFromFile(const string& fileName) { - LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." - << std::endl; vocabulary_.close(); vocabulary_.open(fileName); - if constexpr (isCompressed_) { - const auto& internalExternalVocab = - vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary(); - LOG(INFO) << "Done, number of words: " - << internalExternalVocab.internalVocab().size() << std::endl; - LOG(INFO) << "Number of words in external vocabulary: " - << internalExternalVocab.externalVocab().size() << std::endl; - } else { - LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl; - } // Precomputing ranges for IRIs, blank nodes, and literals, for faster // processing of the `isIrI` and `isLiteral` functions. @@ -75,7 +63,17 @@ void Vocabulary::createFromSet( return getCaseComparator()(a, b, SortLevel::TOTAL); }; std::sort(begin(words), end(words), totalComparison); - vocabulary_.build(words, filename); + auto writer = makeWordWriter(filename); + auto writeWords = [&writer](std::string_view word) { + // All words are stored in the internal vocab (this is consistent with the + // previous behavior). NOTE: This function is currently only used for the + // text index and for few unit tests, where we don't have an external + // vocabulary anyway. + writer(word, false); + }; + ql::ranges::for_each(words, writeWords); + writer.finish(); + vocabulary_.open(filename); LOG(DEBUG) << "END Vocabulary::createFromSet" << std::endl; } @@ -88,19 +86,12 @@ bool Vocabulary::stringIsLiteral(std::string_view s) { // _____________________________________________________________________________ template bool Vocabulary::shouldBeExternalized(string_view s) const { - // TODO Completely refactor the Vocabulary on the different - // Types, it is a mess. 
- - // If the string is not compressed, this means that this is a text vocabulary - // and thus doesn't support externalization. - if constexpr (std::is_same_v) { - if (!stringIsLiteral(s)) { - return shouldEntityBeExternalized(s); - } else { - return shouldLiteralBeExternalized(s); - } + // TODO We should have a completely separate layer that handles the + // externalization, not the Vocab. + if (!stringIsLiteral(s)) { + return shouldEntityBeExternalized(s); } else { - return false; + return shouldLiteralBeExternalized(s); } } @@ -264,17 +255,18 @@ auto Vocabulary::prefixRanges(std::string_view prefix) const } // _____________________________________________________________________________ -template -auto Vocabulary::operator[](IndexType idx) const - -> AccessReturnType_t { +template +auto Vocabulary::operator[](IndexType idx) const + -> AccessReturnType { AD_CONTRACT_CHECK(idx.get() < size()); return vocabulary_[idx.get()]; } // Explicit template instantiations -template class Vocabulary; -template class Vocabulary; +template class Vocabulary; +template class Vocabulary; template void RdfsVocabulary::initializeInternalizedLangs( const nlohmann::json&); diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index 6775a13217..eecf3b832a 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -20,13 +20,10 @@ #include "global/Constants.h" #include "global/Id.h" #include "global/Pattern.h" -#include "index/CompressedString.h" #include "index/StringSortComparator.h" -#include "index/VocabularyOnDisk.h" -#include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/PolymorphicVocabulary.h" #include "index/vocabulary/UnicodeVocabulary.h" #include "index/vocabulary/VocabularyInMemory.h" -#include "index/vocabulary/VocabularyInternalExternal.h" #include "util/Exception.h" #include "util/HashMap.h" #include "util/HashSet.h" @@ -36,11 +33,6 @@ using std::string; using std::vector; -template -using AccessReturnType_t = - std::conditional_t, 
- std::string, std::string_view>; - template class IdRange { public: @@ -64,12 +56,16 @@ inline std::ostream& operator<<(std::ostream& stream, } // A vocabulary. Wraps a vector of strings and provides additional methods for -// retrieval. Template parameters that are supported are: -// std::string -> no compression is applied -// CompressedString -> prefix compression is applied -template +// retrieval. +template class Vocabulary { public: + // The type that is returned by the `operator[]` of this vocabulary. Typically + // either `std::string` or `std::string_view`. + using AccessReturnType = + decltype(std::declval()[0]); + // The index ranges for a prefix + a function to check whether a given index // is contained in one of them. // @@ -94,17 +90,6 @@ class Vocabulary { // The different type of data that is stored in the vocabulary enum class Datatypes { Literal, Iri, Float, Date }; - template - using enable_if_compressed = - std::enable_if_t>; - - template - using enable_if_uncompressed = - std::enable_if_t>; - - static constexpr bool isCompressed_ = - std::is_same_v; - // If a literal uses one of these language tags or starts with one of these // prefixes, it will be externalized. By default, everything is externalized. // Both of these settings can be overridden using the `settings.json` file. @@ -114,10 +99,6 @@ class Vocabulary { vector internalizedLangs_; vector externalizedPrefixes_{""}; - using UnderlyingVocabulary = - std::conditional_t, - VocabularyInMemory>; using VocabularyWithUnicodeComparator = UnicodeVocabulary; @@ -132,10 +113,7 @@ class Vocabulary { using SortLevel = typename ComparatorType::Level; using IndexType = IndexT; - template < - typename = std::enable_if_t || - std::is_same_v>> - Vocabulary() {} + Vocabulary() = default; Vocabulary& operator=(Vocabulary&&) noexcept = default; Vocabulary(Vocabulary&&) noexcept = default; @@ -146,10 +124,7 @@ class Vocabulary { // Get the word with the given `idx`. 
Throw if the `idx` is not contained // in the vocabulary. - AccessReturnType_t operator[](IndexType idx) const; - - // AccessReturnType_t at(IndexType idx) const { return - // operator[](id); } + AccessReturnType operator[](IndexType idx) const; //! Get the number of words in the vocabulary. [[nodiscard]] size_t size() const { return vocabulary_.size(); } @@ -238,11 +213,39 @@ class Vocabulary { // vocabulary. UnderlyingVocabulary::WordWriter makeWordWriter( const std::string& filename) const { + // Note: In GCC this triggers a move construction of the created + // `DiskWriter`, although mandatory copy elision should kick in here + // according to our understanding (and does in clang). We could investigate + // whether this is a bug in GCC or whether we are missing something. return vocabulary_.getUnderlyingVocabulary().makeDiskWriter(filename); } + + // If the `UnderlyingVocabulary` is a `PolymorphicVocabulary`, close the + // vocabulary and set the type of the vocabulary according to the `type` + // argument (see the `PolymorphicVocabulary` class for details). + void resetToType(ad_utility::VocabularyType type) { + if constexpr (std::is_same_v) { + vocabulary_.getUnderlyingVocabulary().resetToType(type); + } + } }; -using RdfsVocabulary = - Vocabulary; -using TextVocabulary = - Vocabulary; +namespace detail { +// The compile-time definition `_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY` can be +// used to disable the external vocab and the compression of the vocab at +// compile time. NOTE: This changes the binary format of QLever's index, so +// changing it requires rebuilding of the indices.
+ +#ifdef _QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY +using UnderlyingVocabRdfsVocabulary = VocabularyInMemory; +#else +using UnderlyingVocabRdfsVocabulary = PolymorphicVocabulary; +#endif + +using UnderlyingVocabTextVocabulary = VocabularyInMemory; +} // namespace detail + +using RdfsVocabulary = Vocabulary; +using TextVocabulary = Vocabulary; diff --git a/src/index/vocabulary/CMakeLists.txt b/src/index/vocabulary/CMakeLists.txt index bb2dfdd4a3..ce746097da 100644 --- a/src/index/vocabulary/CMakeLists.txt +++ b/src/index/vocabulary/CMakeLists.txt @@ -1,2 +1,4 @@ -add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp) +add_library(vocabulary VocabularyInMemory.h VocabularyInMemory.cpp VocabularyInMemoryBinSearch.cpp VocabularyInternalExternal.cpp + PolymorphicVocabulary.cpp VocabularyOnDisk.cpp + ) qlever_target_link_libraries(vocabulary) diff --git a/src/index/vocabulary/CompressedVocabulary.h b/src/index/vocabulary/CompressedVocabulary.h index 6620248363..95d7f42f6b 100644 --- a/src/index/vocabulary/CompressedVocabulary.h +++ b/src/index/vocabulary/CompressedVocabulary.h @@ -259,22 +259,19 @@ CPP_template(typename UnderlyingVocabulary, using WordWriter = DiskWriterFromUncompressedWords; // Return a `DiskWriter` that can be used to create the vocabulary. - DiskWriterFromUncompressedWords makeDiskWriter( - const std::string& filename) const { + static DiskWriterFromUncompressedWords makeDiskWriter( + const std::string& filename) { return DiskWriterFromUncompressedWords{ absl::StrCat(filename, wordsSuffix), absl::StrCat(filename, decodersSuffix)}; } - /// Initialize the vocabulary from the given `words`. - // TODO This can be a generic Mixin... - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); + + // Return a `unique_ptr`. 
+ static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique( + absl::StrCat(filename, wordsSuffix), + absl::StrCat(filename, decodersSuffix)); } // Access to the underlying vocabulary. diff --git a/src/index/vocabulary/PolymorphicVocabulary.cpp b/src/index/vocabulary/PolymorphicVocabulary.cpp new file mode 100644 index 0000000000..27f48a7db1 --- /dev/null +++ b/src/index/vocabulary/PolymorphicVocabulary.cpp @@ -0,0 +1,90 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include "index/vocabulary/PolymorphicVocabulary.h" + +#include + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename) { + std::visit([&filename](auto& vocab) { vocab.open(filename); }, vocab_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::open(const std::string& filename, + VocabularyType type) { + resetToType(type); + open(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::close() { + std::visit([](auto& vocab) { return vocab.close(); }, vocab_); +} + +// _____________________________________________________________________________ +size_t PolymorphicVocabulary::size() const { + return std::visit([](auto& vocab) -> size_t { return vocab.size(); }, vocab_); +} + +// _____________________________________________________________________________ +std::string PolymorphicVocabulary::operator[](uint64_t i) const { + return std::visit([i](auto& vocab) { return std::string{vocab[i]}; }, vocab_); +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter::WordWriter(WordWriters writer) + : writer_(std::move(writer)) {} + +// 
_____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::finish() { + std::visit([](auto& writer) { return writer->finish(); }, writer_); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::WordWriter::operator()(std::string_view word, + bool isExternal) { + std::visit( + [&word, isExternal](auto& writer) { return (*writer)(word, isExternal); }, + writer_); +} + +// _____________________________________________________________________________ +auto PolymorphicVocabulary::makeDiskWriter(const std::string& filename) const + -> WordWriter { + return WordWriter{std::visit( + [&filename](auto& vocab) -> WordWriters { + return vocab.makeDiskWriterPtr(filename); + }, + vocab_)}; +} + +// _____________________________________________________________________________ +PolymorphicVocabulary::WordWriter PolymorphicVocabulary::makeDiskWriter( + const std::string& filename, VocabularyType type) { + PolymorphicVocabulary dummyVocab; + dummyVocab.resetToType(type); + return dummyVocab.makeDiskWriter(filename); +} + +// _____________________________________________________________________________ +void PolymorphicVocabulary::resetToType(VocabularyType type) { + close(); + switch (type.value()) { + case VocabularyType::Enum::InMemoryUncompressed: + vocab_.emplace(); + break; + case VocabularyType::Enum::OnDiskUncompressed: + vocab_.emplace(); + break; + case VocabularyType::Enum::InMemoryCompressed: + vocab_.emplace(); + break; + case VocabularyType::Enum::OnDiskCompressed: + vocab_.emplace(); + break; + default: + AD_FAIL(); + } +} diff --git a/src/index/vocabulary/PolymorphicVocabulary.h b/src/index/vocabulary/PolymorphicVocabulary.h new file mode 100644 index 0000000000..02fa12b962 --- /dev/null +++ b/src/index/vocabulary/PolymorphicVocabulary.h @@ -0,0 +1,123 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#pragma once +#include +#include + +#include + +#include "index/vocabulary/CompressedVocabulary.h" +#include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyInternalExternal.h" +#include "index/vocabulary/VocabularyType.h" +#include "util/json.h" + +namespace polymorphic_vocabulary::detail { + +// For `T = std::variant<A, B, C>`, `WriterPointers<T>::type` is +// `std::variant<unique_ptr<A::WordWriter>, +// unique_ptr<B::WordWriter>, ...>`. This is used in the implementation +// of the `PolymorphicVocabulary` below. +template +struct WriterPointers {}; + +template +struct WriterPointers> { + using type = std::variant...>; +}; +} // namespace polymorphic_vocabulary::detail + +// A vocabulary that can at runtime choose between different vocabulary +// implementations. The only restriction is that a vocabulary can only be read +// from disk with the same implementation that it was written with. +class PolymorphicVocabulary { + public: + using VocabularyType = ad_utility::VocabularyType; + + private: + // Type aliases for all the currently supported vocabularies. If another + // vocabulary is added, don't forget to also register it in the + // `VocabularyType` enum. + using InMemory = VocabularyInMemory; + using External = VocabularyInternalExternal; + using CompressedInMemory = CompressedVocabulary; + using CompressedExternal = CompressedVocabulary; + using Variant = + std::variant; + + // In this variant we store the actual vocabulary. + Variant vocab_; + + public: + // Read a vocabulary with the given `type` from the file with the `filename`. + // A vocabulary with the corresponding `type` must have been previously + // written to that file. + void open(const std::string& filename, VocabularyType type); + + // Close the vocabulary if it is open, and set the underlying vocabulary + // implementation according to the `type` without opening the vocabulary. 
+ void resetToType(VocabularyType type); + + // Same as the overload of `open` above, but expects that the correct + // `VocabularyType` has already been set via `resetToType` above. + void open(const std::string& filename); + + // Close the vocabulary such that it consumes no more RAM. + void close(); + + // Return the total number of words in the vocabulary. + size_t size() const; + + // Return the `i`-th word, throw if `i` is out of bounds. + std::string operator[](uint64_t i) const; + + // Same as `std::lower_bound`, return the smallest entry >= `word`. + template + WordAndIndex lower_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.lower_bound(word, std::move(comp)); + }, + vocab_); + } + + // Analogous to `lower_bound` (see above). + template + WordAndIndex upper_bound(const String& word, Comp comp) const { + return std::visit( + [&word, &comp](auto& vocab) { + return vocab.upper_bound(word, std::move(comp)); + }, + vocab_); + } + + using WordWriters = + polymorphic_vocabulary::detail::WriterPointers::type; + + // The `WordWriter` is used to write a vocabulary to disk word by word (in + // sorted order). + class WordWriter { + WordWriters writer_; + + public: + // Constructor, used by the `makeDiskWriter` functions below. + explicit WordWriter(WordWriters); + + // This function has to be called after the last word has been written. + void finish(); + + // Write the next word to the vocabulary. + void operator()(std::string_view word, bool isExternal); + }; + + // Create a `WordWriter` that will create a vocabulary with the given `type` + // at the given `filename`. + static WordWriter makeDiskWriter(const std::string& filename, + VocabularyType type); + + // Same as above, but the `VocabularyType` is the currently active type of + // `this`. 
+ WordWriter makeDiskWriter(const std::string& filename) const; +}; diff --git a/src/index/vocabulary/UnicodeVocabulary.h b/src/index/vocabulary/UnicodeVocabulary.h index c215843c0f..66aaaf0d67 100644 --- a/src/index/vocabulary/UnicodeVocabulary.h +++ b/src/index/vocabulary/UnicodeVocabulary.h @@ -100,8 +100,4 @@ class UnicodeVocabulary { const UnicodeComparator& getComparator() const { return _comparator; } void close() { _underlyingVocabulary.close(); } - - void build(const std::vector& v, const std::string& filename) { - _underlyingVocabulary.build(v, filename); - } }; diff --git a/src/index/vocabulary/VocabularyInMemory.cpp b/src/index/vocabulary/VocabularyInMemory.cpp index f3db258d59..a1c82231d3 100644 --- a/src/index/vocabulary/VocabularyInMemory.cpp +++ b/src/index/vocabulary/VocabularyInMemory.cpp @@ -8,9 +8,12 @@ using std::string; // _____________________________________________________________________________ void VocabularyInMemory::open(const string& fileName) { + LOG(INFO) << "Reading vocabulary from file " << fileName << " ..." + << std::endl; _words.clear(); ad_utility::serialization::FileReadSerializer file(fileName); file >> _words; + LOG(INFO) << "Done, number of words: " << size() << std::endl; } // _____________________________________________________________________________ diff --git a/src/index/vocabulary/VocabularyInMemory.h b/src/index/vocabulary/VocabularyInMemory.h index 5ce18fe721..6d68e2a6f6 100644 --- a/src/index/vocabulary/VocabularyInMemory.h +++ b/src/index/vocabulary/VocabularyInMemory.h @@ -68,7 +68,11 @@ class VocabularyInMemory struct WordWriter { typename Words::Writer writer_; explicit WordWriter(const std::string& filename) : writer_{filename} {} - void operator()(std::string_view str) { + + // Write a word. The `isExternalDummy` is only there to have a consistent + // interface with the `VocabularyInternalExternal`. 
+ void operator()(std::string_view str, + [[maybe_unused]] bool isExternalDummy = false) { writer_.push(str.data(), str.size()); } @@ -77,25 +81,21 @@ class VocabularyInMemory // Return a `WordWriter` that directly writes the words to the given // `filename`. The words are not materialized in RAM, but the vocabulary later - // has to be explicitly initizlied via `open(filename)`. - WordWriter makeDiskWriter(const std::string& filename) const { + // has to be explicitly initialized via `open(filename)`. + static WordWriter makeDiskWriter(const std::string& filename) { return WordWriter{filename}; } + // Same as `makeDiskWriter` above, but the result is returned via + // `unique_ptr`. + static std::unique_ptr makeDiskWriterPtr( + const std::string& filename) { + return std::make_unique(filename); + } + /// Clear the vocabulary. void close() { _words.clear(); } - /// Initialize the vocabulary from the given `words`. - void build(const std::vector& words, - const std::string& filename) { - WordWriter writer = makeDiskWriter(filename); - for (const auto& word : words) { - writer(word); - } - writer.finish(); - open(filename); - } - // Const access to the underlying words. auto begin() const { return _words.begin(); } auto end() const { return _words.end(); } diff --git a/src/index/vocabulary/VocabularyInternalExternal.cpp b/src/index/vocabulary/VocabularyInternalExternal.cpp index 62c5e29455..3d3d5fffb5 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.cpp +++ b/src/index/vocabulary/VocabularyInternalExternal.cpp @@ -37,3 +37,15 @@ void VocabularyInternalExternal::WordWriter::finish() { internalWriter_.finish(); externalWriter_.finish(); } + +// _____________________________________________________________________________ +void VocabularyInternalExternal::open(const string& filename) { + LOG(INFO) << "Reading vocabulary from file " << filename << " ..." 
+ << std::endl; + internalVocab_.open(filename + ".internal"); + externalVocab_.open(filename + ".external"); + LOG(INFO) << "Done, number of words: " << size() << std::endl; + LOG(INFO) << "Number of words in internal vocabulary (these are also part " + "of the external vocabulary): " + << internalVocab_.size() << std::endl; +} diff --git a/src/index/vocabulary/VocabularyInternalExternal.h b/src/index/vocabulary/VocabularyInternalExternal.h index f9024369bd..209820c604 100644 --- a/src/index/vocabulary/VocabularyInternalExternal.h +++ b/src/index/vocabulary/VocabularyInternalExternal.h @@ -8,8 +8,8 @@ #include #include -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyInMemoryBinSearch.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "index/vocabulary/VocabularyTypes.h" #include "util/Exception.h" @@ -40,10 +40,7 @@ class VocabularyInternalExternal { // Read the vocabulary from a file. The file must have been created using a // `WordWriter`. - void open(const string& filename) { - internalVocab_.open(filename + ".internal"); - externalVocab_.open(filename + ".external"); - } + void open(const string& filename); // Return the total number of words [[nodiscard]] size_t size() const { return externalVocab_.size(); } @@ -117,6 +114,15 @@ class VocabularyInternalExternal { void finish(); }; + // Return a `WordWriter` or (in the second function) a + // `unique_ptr` for the given filename. + static WordWriter makeDiskWriter(const std::string& filename) { + return WordWriter{filename}; + } + static auto makeDiskWriterPtr(const std::string& filename) { + return std::make_unique(filename); + } + /// Clear the vocabulary. 
void close() { internalVocab_.close(); } diff --git a/src/index/VocabularyOnDisk.cpp b/src/index/vocabulary/VocabularyOnDisk.cpp similarity index 94% rename from src/index/VocabularyOnDisk.cpp rename to src/index/vocabulary/VocabularyOnDisk.cpp index 251130be26..8f23170300 100644 --- a/src/index/VocabularyOnDisk.cpp +++ b/src/index/vocabulary/VocabularyOnDisk.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include @@ -23,8 +23,8 @@ OffsetAndSize VocabularyOnDisk::getOffsetAndSize(uint64_t i) const { std::string VocabularyOnDisk::operator[](uint64_t idx) const { AD_CONTRACT_CHECK(idx < size()); auto offsetAndSize = getOffsetAndSize(idx); - string result(offsetAndSize._size, '\0'); - file_.read(result.data(), offsetAndSize._size, offsetAndSize._offset); + string result(offsetAndSize.size_, '\0'); + file_.read(result.data(), offsetAndSize.size_, offsetAndSize.offset_); return result; } @@ -88,7 +88,7 @@ VocabularyOnDisk::WordWriter::~WordWriter() { void VocabularyOnDisk::buildFromStringsAndIds( const std::vector>& wordsAndIds, const std::string& fileName) { - return buildFromIterable(wordsAndIds, fileName); + buildFromIterable(wordsAndIds, fileName); } // _____________________________________________________________________________ diff --git a/src/index/VocabularyOnDisk.h b/src/index/vocabulary/VocabularyOnDisk.h similarity index 99% rename from src/index/VocabularyOnDisk.h rename to src/index/vocabulary/VocabularyOnDisk.h index f677ac3e7a..87506a4ed5 100644 --- a/src/index/VocabularyOnDisk.h +++ b/src/index/vocabulary/VocabularyOnDisk.h @@ -86,8 +86,8 @@ class VocabularyOnDisk : public VocabularyBinarySearchMixin { // The offset of a word in `file_` and its size in number of bytes. 
struct OffsetAndSize { - uint64_t _offset; - uint64_t _size; + uint64_t offset_; + uint64_t size_; }; // Helper function for implementing a random access iterator. diff --git a/src/index/vocabulary/VocabularyType.h b/src/index/vocabulary/VocabularyType.h new file mode 100644 index 0000000000..b3a9cdf245 --- /dev/null +++ b/src/index/vocabulary/VocabularyType.h @@ -0,0 +1,97 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once + +#include +#include + +#include "util/Random.h" +#include "util/json.h" + +namespace ad_utility { + +// A lightweight enum for the different implementation strategies of the +// `PolymorphicVocabulary`. Also includes operations for conversion to and from +// string. +// TODO Implement a generic mixin that can also be used for other +// enums, especially those used in command-line interfaces. +class VocabularyType { + public: + // The different vocabulary implementations. + enum struct Enum { + InMemoryUncompressed, + OnDiskUncompressed, + InMemoryCompressed, + OnDiskCompressed + }; + + private: + Enum value_ = Enum::InMemoryUncompressed; + + static constexpr size_t numValues_ = 4; + // All possible values. + static constexpr std::array all_{ + Enum::InMemoryUncompressed, Enum::OnDiskUncompressed, + Enum::InMemoryCompressed, Enum::OnDiskCompressed}; + + // The string representations of the enum values. + static constexpr std::array descriptions_{ + "in-memory-uncompressed", "on-disk-uncompressed", "in-memory-compressed", + "on-disk-compressed"}; + + static_assert(all_.size() == descriptions_.size()); + + public: + // Constructors + VocabularyType() = default; + explicit VocabularyType(Enum value) : value_{value} {} + + // Create from a string. The string must be one of the `descriptions_`, + // otherwise a `std::runtime_error` is thrown. 
+ static VocabularyType fromString(std::string_view description) { + auto it = ql::ranges::find(descriptions_, description); + if (it == descriptions_.end()) { + throw std::runtime_error{ + absl::StrCat("\"", description, + "\" is not a valid vocabulary type. The currently " + "supported vocabulary types are ", + getListOfSupportedValues())}; + } + return VocabularyType{all().at(it - descriptions_.begin())}; + } + + // Return all the possible enum values as a comma-separated single string. + static std::string getListOfSupportedValues() { + return absl::StrJoin(descriptions_, ", "); + } + + // Convert the enum to the corresponding string. + std::string_view toString() const { + return descriptions_.at(static_cast(value_)); + } + + // Return the actual enum value. + Enum value() const { return value_; } + + // Return a list of all the enum values. + static constexpr const std::array& all() { return all_; } + + // Conversion to JSON. + friend void to_json(nlohmann::json& j, const VocabularyType& vocabEnum) { + j = vocabEnum.toString(); + } + + // Conversion from JSON. + friend void from_json(const nlohmann::json& j, VocabularyType& vocabEnum) { + vocabEnum = VocabularyType::fromString(static_cast(j)); + } + + // Get a random value, useful for fuzz testing. + static VocabularyType random() { + ad_utility::FastRandomIntGenerator r; + return VocabularyType{static_cast(r() % numValues_)}; + } +}; +} // namespace ad_utility diff --git a/src/util/File.h b/src/util/File.h index cde77a4aaf..782e266380 100644 --- a/src/util/File.h +++ b/src/util/File.h @@ -52,6 +52,10 @@ class File { open(filename, mode); } + // Files are move-only types. 
+ File(const File&) = delete; + File& operator=(const File&) = delete; + File& operator=(File&& rhs) noexcept { if (isOpen()) { close(); @@ -63,7 +67,7 @@ class File { return *this; } - File(File&& rhs) : name_{std::move(rhs.name_)}, file_{rhs.file_} { + File(File&& rhs) noexcept : name_{std::move(rhs.name_)}, file_{rhs.file_} { rhs.file_ = nullptr; } diff --git a/src/util/ProgramOptionsHelpers.h b/src/util/ProgramOptionsHelpers.h index ea572769cd..682ecc8527 100644 --- a/src/util/ProgramOptionsHelpers.h +++ b/src/util/ProgramOptionsHelpers.h @@ -8,6 +8,7 @@ #include #include +#include "index/vocabulary/VocabularyType.h" #include "util/Concepts.h" #include "util/MemorySize/MemorySize.h" #include "util/Parameters.h" @@ -55,6 +56,7 @@ void validate(boost::any& v, const std::vector& values, std::optional*, int) { // First parse as a T T* dummy = nullptr; + using namespace boost::program_options; validate(v, values, dummy, 0); // Wrap the T inside std::optional @@ -121,6 +123,22 @@ class ParameterToProgramOptionFactory { } }; +// This function is required to use `VocabularyType` in +// `boost::program_options`. +inline void validate(boost::any& v, const std::vector& values, + VocabularyType*, int) { + using namespace boost::program_options; + + // Make sure no previous assignment to 'v' was made. + validators::check_first_occurrence(v); + // Extract the first string from 'values'. If there is more than + // one string, it's an error, and an exception will be thrown. + const string& s = validators::get_single_string(values); + + // Convert the string to `VocabularyType` and put it into the option. 
+ v = VocabularyType::fromString(s); +} + } // namespace ad_utility #endif // QLEVER_PROGRAMOPTIONSHELPERS_H diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d588e74e55..c51252cdb9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -201,7 +201,7 @@ addLinkAndDiscoverTest(BatchedPipelineTest) addLinkAndDiscoverTest(TupleHelpersTest) -addLinkAndDiscoverTest(StringSortComparatorTest) +addLinkAndDiscoverTestNoLibs(StringSortComparatorTest) addLinkAndDiscoverTest(PriorityQueueTest) diff --git a/test/StringSortComparatorTest.cpp b/test/StringSortComparatorTest.cpp index ade2178ae0..b6143ec70f 100644 --- a/test/StringSortComparatorTest.cpp +++ b/test/StringSortComparatorTest.cpp @@ -125,6 +125,11 @@ TEST(StringSortComparatorTest, TripleComponentComparatorTotal) { auto bSplit = comparator.extractAndTransformComparable( b, TripleComponentComparator::Level::TOTAL); EXPECT_EQ(ab, comp(aSplit, bSplit)); + EXPECT_EQ(ab, comp(a, bSplit)); + EXPECT_EQ(ab, comp(aSplit, b)); + + EXPECT_EQ(ba, comp(b, aSplit)); + EXPECT_EQ(ba, comp(bSplit, a)); EXPECT_EQ(ba, comp(bSplit, aSplit)); }; diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 6ba1b8c6de..0cc6ae74c6 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -22,8 +22,8 @@ inline string getTextRecordFromResultTable(const QueryExecutionContext* qec, result.idTable().getColumn(0)[rowIndex].getTextRecordIndex().get(); if (nofNonLiterals <= textRecordIdFromTable) { // Return when from Literals - return qec->getIndex().indexToString( - VocabIndex::make(textRecordIdFromTable - nofNonLiterals)); + return std::string{qec->getIndex().indexToString( + VocabIndex::make(textRecordIdFromTable - nofNonLiterals))}; } else { // Return when from DocsDB return qec->getIndex().getTextExcerpt( @@ -41,8 +41,8 @@ inline const TextRecordIndex getTextRecordIdFromResultTable( inline string getEntityFromResultTable(const 
QueryExecutionContext* qec, const ProtoResult& result, const size_t& rowIndex) { - return qec->getIndex().indexToString( - result.idTable().getColumn(1)[rowIndex].getVocabIndex()); + return std::string{qec->getIndex().indexToString( + result.idTable().getColumn(1)[rowIndex].getVocabIndex())}; } // Only use on prefix search results diff --git a/test/index/vocabulary/CMakeLists.txt b/test/index/vocabulary/CMakeLists.txt index 3b4499a751..2db01bd594 100644 --- a/test/index/vocabulary/CMakeLists.txt +++ b/test/index/vocabulary/CMakeLists.txt @@ -1,11 +1,15 @@ -addLinkAndDiscoverTest(VocabularyInMemoryTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryTest vocabulary) -addLinkAndDiscoverTest(VocabularyOnDiskTest index) +addLinkAndDiscoverTestNoLibs(VocabularyOnDiskTest index) addLinkAndDiscoverTest(CompressedVocabularyTest vocabulary) -addLinkAndDiscoverTest(UnicodeVocabularyTest vocabulary) +addLinkAndDiscoverTestNoLibs(UnicodeVocabularyTest vocabulary) -addLinkAndDiscoverTest(VocabularyInternalExternalTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInternalExternalTest vocabulary) -addLinkAndDiscoverTest(VocabularyInMemoryBinSearchTest vocabulary) +addLinkAndDiscoverTestNoLibs(VocabularyInMemoryBinSearchTest vocabulary) + +addLinkAndDiscoverTestNoLibs(PolymorphicVocabularyTest vocabulary) + +addLinkAndDiscoverTestNoLibs(VocabularyTypeTest) diff --git a/test/index/vocabulary/CompressedVocabularyTest.cpp b/test/index/vocabulary/CompressedVocabularyTest.cpp index a1a445e213..8a6f39d2bb 100644 --- a/test/index/vocabulary/CompressedVocabularyTest.cpp +++ b/test/index/vocabulary/CompressedVocabularyTest.cpp @@ -6,10 +6,10 @@ #include "VocabularyTestHelpers.h" #include "backports/algorithm.h" -#include "index/VocabularyOnDisk.h" #include "index/vocabulary/CompressedVocabulary.h" #include "index/vocabulary/PrefixCompressor.h" #include "index/vocabulary/VocabularyInMemory.h" +#include "index/vocabulary/VocabularyOnDisk.h" namespace { diff --git 
a/test/index/vocabulary/PolymorphicVocabularyTest.cpp b/test/index/vocabulary/PolymorphicVocabularyTest.cpp new file mode 100644 index 0000000000..c5c91ed686 --- /dev/null +++ b/test/index/vocabulary/PolymorphicVocabularyTest.cpp @@ -0,0 +1,53 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/PolymorphicVocabulary.h" + +using ad_utility::VocabularyType; + +namespace { + +// Test a `PolymorphicVocabulary` with a given `vocabType`. +void testForVocabType(VocabularyType::Enum vocabType) { + VocabularyType type{vocabType}; + std::string filename = + absl::StrCat("polymorphicVocabularyTest.", type.toString(), ".vocab"); + + auto writer = PolymorphicVocabulary::makeDiskWriter(filename, type); + writer("alpha", false); + writer("beta", true); + writer("gamma", false); + writer.finish(); + + PolymorphicVocabulary vocab; + vocab.open(filename, type); + EXPECT_EQ(vocab.size(), 3); + + EXPECT_EQ(vocab[0], "alpha"); + EXPECT_EQ(vocab[1], "beta"); + EXPECT_EQ(vocab[2], "gamma"); + + auto wI = vocab.lower_bound("alx", ql::ranges::less{}); + EXPECT_EQ(wI.index(), 1); + EXPECT_EQ(wI.word(), "beta"); + + wI = vocab.upper_bound("gamma", ql::ranges::less{}); + EXPECT_TRUE(wI.isEnd()); +} +} // namespace + +// Test the general functionality of the `PolymorphicVocabulary` for all the +// possible `VocabularyType`s. +TEST(PolymorphicVocabulary, basicTests) { + ql::ranges::for_each(VocabularyType::all(), &testForVocabType); +} + +// Test a corner case in a `switch` statement. 
+TEST(PolymorphicVocabulary, invalidVocabularyType) { + PolymorphicVocabulary vocab; + auto invalidType = VocabularyType{static_cast(23401)}; + EXPECT_ANY_THROW(vocab.resetToType(invalidType)); +} diff --git a/test/index/vocabulary/VocabularyInternalExternalTest.cpp b/test/index/vocabulary/VocabularyInternalExternalTest.cpp index 6c41dc415a..08ef9164dc 100644 --- a/test/index/vocabulary/VocabularyInternalExternalTest.cpp +++ b/test/index/vocabulary/VocabularyInternalExternalTest.cpp @@ -34,7 +34,7 @@ class VocabularyCreator { auto createVocabularyImpl(const std::vector& words) { VocabularyInternalExternal vocabulary; { - auto writer = VocabularyInternalExternal::WordWriter(vocabFilename_); + auto writer = VocabularyInternalExternal::makeDiskWriter(vocabFilename_); size_t i = 0; for (auto& word : words) { writer(word, i % 2 == 0); diff --git a/test/index/vocabulary/VocabularyOnDiskTest.cpp b/test/index/vocabulary/VocabularyOnDiskTest.cpp index 54fc934f24..ee9090125e 100644 --- a/test/index/vocabulary/VocabularyOnDiskTest.cpp +++ b/test/index/vocabulary/VocabularyOnDiskTest.cpp @@ -5,7 +5,7 @@ #include #include "./VocabularyTestHelpers.h" -#include "index/VocabularyOnDisk.h" +#include "index/vocabulary/VocabularyOnDisk.h" #include "util/Forward.h" namespace { diff --git a/test/index/vocabulary/VocabularyTypeTest.cpp b/test/index/vocabulary/VocabularyTypeTest.cpp new file mode 100644 index 0000000000..180a82e159 --- /dev/null +++ b/test/index/vocabulary/VocabularyTypeTest.cpp @@ -0,0 +1,37 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "index/vocabulary/VocabularyType.h" + +using namespace ad_utility; +// Simple tests for the glorified enum `VocabularyType`. 
+TEST(VocabularyType, allTests) { + using E = VocabularyType::Enum; + using T = VocabularyType; + T t{}; + EXPECT_EQ(t.value(), E::InMemoryUncompressed); + for (auto e : T::all()) { + EXPECT_EQ(T{e}.value(), e); + } + + t = T::fromString("on-disk-compressed"); + EXPECT_EQ(t.value(), E::OnDiskCompressed); + + EXPECT_ANY_THROW(T::fromString("kartoffelsalat")); + + EXPECT_EQ(T{E::OnDiskUncompressed}.toString(), "on-disk-uncompressed"); + + using namespace ::testing; + EXPECT_THAT(T::getListOfSupportedValues(), + AllOf(HasSubstr("in-memory-uncompressed"), + HasSubstr(", on-disk-uncompressed"))); + + for (auto e : T::all()) { + nlohmann::json j = T{e}; + t = j.get(); + EXPECT_EQ(t.value(), e); + } +} diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 8e1a693209..6cc5724690 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -186,6 +186,8 @@ Index makeTestIndex(const std::string& indexBasename, index.loadAllPermutations() = loadAllPermutations; qlever::InputFileSpecification spec{inputFilename, qlever::Filetype::Turtle, std::nullopt}; + // randomly choose one of the vocabulary implementations + index.getImpl().setVocabularyTypeForIndexBuilding(VocabularyType::random()); index.createFromFiles({spec}); if (createTextIndex) { if (contentsOfWordsFileAndDocsFile.has_value()) {