Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support uncompressed or fully in-memory vocabularies #1740

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,11 @@ if (${USE_CPP_17_BACKPORTS})
add_definitions("-DQLEVER_CPP_17 -DCPP_CXX_CONCEPTS=0")
endif()

# Optional build switch (cache variable, default OFF): store QLever's vocabulary
# uncompressed and completely in RAM. When the switch is enabled, the
# preprocessor symbol `_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY` is added to the
# compile definitions via `-D`.
set(VOCAB_UNCOMPRESSED_IN_MEMORY OFF CACHE BOOL "Store QLever's vocabulary uncompressed and completely in RAM")
if (${VOCAB_UNCOMPRESSED_IN_MEMORY})
add_definitions("-D_QLEVER_VOCAB_UNCOMPRESSED_IN_MEMORY")
endif ()

# Enable the specification of additional linker flags manually from the commandline
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ADDITIONAL_LINKER_FLAGS}")
Expand Down
11 changes: 9 additions & 2 deletions src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
case Datatype::LocalVocabIndex:
return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
case Datatype::VocabIndex: {
auto entity = index.indexToString(id.getVocabIndex());
return LiteralOrIri::fromStringRepresentation(entity);
auto getEntity = [&index, id]() {
return index.indexToString(id.getVocabIndex());
};
// The type of entity might be `string_view` (if the vocabulary is stored
// uncompressed in RAM) or `string` (if it is on disk, compressed, or
// both). The following code works and is efficient in all cases. In
// particular, the `std::string` constructor is compiled out because of
// RVO if `getEntity()` already returns a `string`.
return LiteralOrIri::fromStringRepresentation(std::string(getEntity()));
}
default:
AD_FAIL();
Expand Down
125 changes: 72 additions & 53 deletions src/global/Pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "util/File.h"
#include "util/Generator.h"
#include "util/Iterators.h"
#include "util/ResetWhenMoved.h"
#include "util/Serializer/FileSerializer.h"
#include "util/Serializer/SerializeVector.h"
#include "util/TypeTraits.h"
Expand All @@ -39,8 +40,8 @@
using ref = value_type&;
using const_ref = const value_type&;

ref operator[](const size_t pos) { return _data[pos]; }
const_ref operator[](const size_t pos) const { return _data[pos]; }
ref operator[](const size_t pos) { return data_[pos]; }

Check warning on line 43 in src/global/Pattern.h

View check run for this annotation

Codecov / codecov/patch

src/global/Pattern.h#L43

Added line #L43 was not covered by tests
const_ref operator[](const size_t pos) const { return data_[pos]; }

using const_iterator = ad_utility::IteratorForAccessOperator<
Pattern, ad_utility::AccessViaBracketOperator, ad_utility::IsConst::True>;
Expand All @@ -51,19 +52,19 @@

bool operator==(const Pattern& other) const = default;

size_t size() const { return _data.size(); }
size_t size() const { return data_.size(); }

void push_back(value_type i) { _data.push_back(i); }
void push_back(value_type i) { data_.push_back(i); }

void clear() { _data.clear(); }
void clear() { data_.clear(); }

const_ref back() const { return _data.back(); }
ref back() { return _data.back(); }
bool empty() const { return _data.empty(); }
const_ref back() const { return data_.back(); }

Check warning on line 61 in src/global/Pattern.h

View check run for this annotation

Codecov / codecov/patch

src/global/Pattern.h#L61

Added line #L61 was not covered by tests
ref back() { return data_.back(); }
bool empty() const { return data_.empty(); }

const value_type* data() const { return _data.data(); }
const value_type* data() const { return data_.data(); }

std::vector<value_type> _data;
std::vector<value_type> data_;
};

namespace detail {
Expand Down Expand Up @@ -113,19 +114,19 @@
static_assert(
ad_utility::SimilarTo<decltype(*(input.begin()->begin())), data_type>);
// Also make room for the end offset of the last element.
_offsets.reserve(input.size() + 1);
offsets_.reserve(input.size() + 1);
size_t dataSize = 0;
for (const auto& element : input) {
_offsets.push_back(dataSize);
offsets_.push_back(dataSize);
dataSize += element.size();
}
// The last offset is the offset right after the last element.
_offsets.push_back(dataSize);
offsets_.push_back(dataSize);

_data.reserve(dataSize);
data_.reserve(dataSize);

for (const auto& el : input) {
_data.insert(_data.end(), el.begin(), el.end());
data_.insert(data_.end(), el.begin(), el.end());
}
}

Expand All @@ -137,9 +138,9 @@
CompactVectorOfStrings(CompactVectorOfStrings&&) noexcept = default;

// There is one more offset than the number of elements.
size_t size() const { return ready() ? _offsets.size() - 1 : 0; }
size_t size() const { return ready() ? offsets_.size() - 1 : 0; }

bool ready() const { return !_offsets.empty(); }
bool ready() const { return !offsets_.empty(); }

/**
* @brief operator []
Expand All @@ -148,9 +149,9 @@
* elements stored at the pointers target.
*/
const value_type operator[](size_t i) const {
offset_type offset = _offsets[i];
const data_type* ptr = _data.data() + offset;
size_t size = _offsets[i + 1] - offset;
offset_type offset = offsets_[i];
const data_type* ptr = data_.data() + offset;
size_t size = offsets_[i + 1] - offset;
return {ptr, size};
}

Expand All @@ -169,80 +170,100 @@

// Allow serialization via the ad_utility::serialization interface.
AD_SERIALIZE_FRIEND_FUNCTION(CompactVectorOfStrings) {
serializer | arg._data;
serializer | arg._offsets;
serializer | arg.data_;
serializer | arg.offsets_;
}

private:
std::vector<data_type> _data;
std::vector<offset_type> _offsets;
std::vector<data_type> data_;
std::vector<offset_type> offsets_;
};

namespace detail {
// Allows the incremental writing of a `CompactVectorOfStrings` directly to a
// file.
template <typename data_type>
struct CompactStringVectorWriter {
ad_utility::File _file;
off_t _startOfFile;
ad_utility::File file_;
off_t startOfFile_;
using offset_type = typename CompactVectorOfStrings<data_type>::offset_type;
std::vector<offset_type> _offsets;
bool _finished = false;
offset_type _nextOffset = 0;
std::vector<offset_type> offsets_;

// A `CompactStringVectorWriter` that has been moved from may not call
// `finish()` any more in its destructor.
ad_utility::ResetWhenMoved<bool, true> finished_ = false;
offset_type nextOffset_ = 0;

explicit CompactStringVectorWriter(const std::string& filename)
: _file{filename, "w"} {
: file_{filename, "w"} {
commonInitialization();
}

explicit CompactStringVectorWriter(ad_utility::File&& file)
: _file{std::move(file)} {
: file_{std::move(file)} {
commonInitialization();
}

void push(const data_type* data, size_t elementSize) {
AD_CONTRACT_CHECK(!_finished);
_offsets.push_back(_nextOffset);
_nextOffset += elementSize;
_file.write(data, elementSize * sizeof(data_type));
AD_CONTRACT_CHECK(!finished_);
offsets_.push_back(nextOffset_);
nextOffset_ += elementSize;
file_.write(data, elementSize * sizeof(data_type));
}

// Finish writing, and return the moved file. If the return value is
// discarded, then the file will be closed immediately by the destructor of
// the `File` class.
ad_utility::File finish() {
if (_finished) {
if (finished_) {
return {};
}
_finished = true;
_offsets.push_back(_nextOffset);
_file.seek(_startOfFile, SEEK_SET);
_file.write(&_nextOffset, sizeof(size_t));
_file.seek(0, SEEK_END);
ad_utility::serialization::FileWriteSerializer f{std::move(_file)};
f << _offsets;
finished_ = true;
offsets_.push_back(nextOffset_);
file_.seek(startOfFile_, SEEK_SET);
file_.write(&nextOffset_, sizeof(size_t));
file_.seek(0, SEEK_END);
ad_utility::serialization::FileWriteSerializer f{std::move(file_)};
f << offsets_;
return std::move(f).file();
}

~CompactStringVectorWriter() {
if (!_finished) {
if (!finished_) {
ad_utility::terminateIfThrows(
[this]() { finish(); },
"Finishing the underlying File of a `CompactStringVectorWriter` "
"during destruction failed");
}
}

// The copy operations would be deleted implicitly (because `File` is not
// copyable).
CompactStringVectorWriter(const CompactStringVectorWriter&) = delete;
CompactStringVectorWriter& operator=(const CompactStringVectorWriter&) =
delete;

// The move operations have to be explicitly defaulted, because we have a
// manually defined destructor.
// Note: The defaulted move operations behave correctly because of the usage
// of `ResetWhenMoved` with the `finished_` member.
CompactStringVectorWriter(CompactStringVectorWriter&&) = default;
CompactStringVectorWriter& operator=(CompactStringVectorWriter&&) = default;

private:
// Has to be run by all the constructors
void commonInitialization() {
AD_CONTRACT_CHECK(_file.isOpen());
// We don't known the data size yet.
_startOfFile = _file.tell();
AD_CONTRACT_CHECK(file_.isOpen());
// We don't know the data size yet.
startOfFile_ = file_.tell();
size_t dataSizeDummy = 0;
_file.write(&dataSizeDummy, sizeof(dataSizeDummy));
file_.write(&dataSizeDummy, sizeof(dataSizeDummy));
}
};
static_assert(
std::is_nothrow_move_assignable_v<CompactStringVectorWriter<char>>);
static_assert(
std::is_nothrow_move_constructible_v<CompactStringVectorWriter<char>>);
} // namespace detail

// Forward iterator for a `CompactVectorOfStrings` that reads directly from
Expand Down Expand Up @@ -282,13 +303,11 @@
}
}

namespace std {
template <>
struct hash<Pattern> {
std::size_t operator()(const Pattern& p) const {
struct std::hash<Pattern> {
std::size_t operator()(const Pattern& p) const noexcept {
std::string_view s = std::string_view(
reinterpret_cast<const char*>(p._data.data()), sizeof(Id) * p.size());
reinterpret_cast<const char*>(p.data_.data()), sizeof(Id) * p.size());
return hash<std::string_view>()(s);
}
};
} // namespace std
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_subdirectory(vocabulary)
add_library(index
Index.cpp IndexImpl.cpp IndexImpl.Text.cpp
Vocabulary.cpp VocabularyOnDisk.cpp
Vocabulary.cpp
LocatedTriples.cpp Permutation.cpp TextMetaData.cpp
DocsDB.cpp FTSAlgorithms.cpp
PrefixHeuristic.cpp CompressedRelation.cpp
Expand Down
3 changes: 2 additions & 1 deletion src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ constinit inline std::atomic<size_t> BUFFER_SIZE_PARTIAL_TO_GLOBAL_ID_MAPPINGS =
// the overhead of the metadata that has to be stored per block becomes
// infeasible. 250K seems to be a reasonable tradeoff here.
constexpr inline ad_utility::MemorySize
UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN = 250_kB;
UNCOMPRESSED_BLOCKSIZE_COMPRESSED_METADATA_PER_COLUMN =
ad_utility::MemorySize::kilobytes(250);

constexpr inline size_t NumColumnsIndexBuilding = 4;

Expand Down
5 changes: 3 additions & 2 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ size_t Index::getCardinality(
}

// ____________________________________________________________________________
std::string Index::indexToString(VocabIndex id) const {
auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType {
return pimpl_->indexToString(id);
}

// ____________________________________________________________________________
std::string_view Index::indexToString(WordVocabIndex id) const {
auto Index::indexToString(WordVocabIndex id) const
-> TextVocabulary::AccessReturnType {
return pimpl_->indexToString(id);
}

Expand Down
10 changes: 4 additions & 6 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,11 @@ class Index {
// Read necessary metadata into memory and open file handles.
void addTextFromOnDiskIndex();

using Vocab =
Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
using Vocab = RdfsVocabulary;
[[nodiscard]] const Vocab& getVocab() const;
Vocab& getNonConstVocabForTesting();

using TextVocab =
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
using TextVocab = TextVocabulary;
[[nodiscard]] const TextVocab& getTextVocab() const;

// Get a (non-owning) pointer to the BlankNodeManager of this Index.
Expand All @@ -132,8 +130,8 @@ class Index {

// TODO<joka921> Once we have an overview over the folding this logic should
// probably not be in the index class.
std::string indexToString(VocabIndex id) const;
std::string_view indexToString(WordVocabIndex id) const;
Vocab::AccessReturnType indexToString(VocabIndex id) const;
TextVocab::AccessReturnType indexToString(WordVocabIndex id) const;

[[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const;

Expand Down
10 changes: 10 additions & 0 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <string>

#include "CompilationInfo.h"
#include "IndexImpl.h"
#include "global/Constants.h"
#include "index/ConstantsIndexBuilding.h"
#include "index/Index.h"
Expand Down Expand Up @@ -166,6 +167,7 @@ int main(int argc, char** argv) {
bool addWordsFromLiterals = false;
std::optional<ad_utility::MemorySize> stxxlMemory;
std::optional<ad_utility::MemorySize> parserBufferSize;
std::optional<ad_utility::VocabularyType> vocabType;
optind = 1;

Index index{ad_utility::makeUnlimitedAllocator<Id>()};
Expand Down Expand Up @@ -224,6 +226,10 @@ int main(int argc, char** argv) {
add("only-pso-and-pos-permutations,o", po::bool_switch(&onlyPsoAndPos),
"Only build the PSO and POS permutations. This is faster, but then "
"queries with predicate variables are not supported");
auto msg = absl::StrCat(
"The vocabulary implementation for strings in qlever, can be any of ",
ad_utility::VocabularyType::getListOfSupportedValues());
add("vocabulary-type", po::value(&vocabType), msg.c_str());

// Options for the index building process.
add("stxxl-memory,m", po::value(&stxxlMemory),
Expand Down Expand Up @@ -257,6 +263,10 @@ int main(int argc, char** argv) {
index.parserBufferSize() = parserBufferSize.value();
}

if (vocabType.has_value()) {
index.getImpl().setVocabularyTypeForIndexBuilding(vocabType.value());
}

// If no text index name was specified, take the part of the wordsfile after
// the last slash.
if (textIndexName.empty() && !wordsfile.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
if (!isLiteral(text)) {
continue;
}
WordsFileLine entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{std::string{text}, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
Expand Down
Loading
Loading