Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use LiteralOrIri instead of std::string in LocalVocab #1333

Merged
merged 3 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ else ()
endif ()

## Build targets for address sanitizer
# AddressSanitize

set(CMAKE_C_FLAGS_ASAN
"-fsanitize=address -fsanitize=undefined -fno-optimize-sibling-calls -fsanitize-address-use-after-scope -fno-omit-frame-pointer -g -O1"
CACHE STRING "Flags used by the C compiler during AddressSanitizer builds."
Expand Down
4 changes: 3 additions & 1 deletion benchmark/GroupByHashMapBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ auto generateRandomLocalVocabAndIndicesVec = [](size_t n, size_t m) {
for (size_t j = 0; j < m; j++) {
str += alphanum.at(gen());
}
indices.push_back(localVocab.getIndexAndAddIfNotContained(str));
using namespace ad_utility::triple_component;
indices.push_back(localVocab.getIndexAndAddIfNotContained(
LiteralOrIri::literalWithoutQuotes(str)));
}

return std::make_pair(std::move(localVocab), indices);
Expand Down
39 changes: 19 additions & 20 deletions src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,23 @@
return std::nullopt;
}
}

using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;
auto handleIriOrLiteral = [&escapeFunction](const LiteralOrIri& word)
-> std::optional<std::pair<std::string, const char*>> {
if constexpr (onlyReturnLiterals) {
if (!word.isLiteral()) {
return std::nullopt;
}
}
if constexpr (removeQuotesAndAngleBrackets) {
// TODO<joka921> Can we get rid of the string copying here?
return std::pair{
escapeFunction(std::string{asStringViewUnsafe(word.getContent())}),
nullptr};
}
return std::pair{escapeFunction(word.toStringRepresentation()), nullptr};

Check warning on line 215 in src/engine/ExportQueryExecutionTrees.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/ExportQueryExecutionTrees.cpp#L215

Added line #L215 was not covered by tests
};
switch (id.getDatatype()) {
case Datatype::WordVocabIndex: {
std::optional<string> entity =
Expand All @@ -212,28 +229,10 @@
auto litOrIri =
ad_utility::triple_component::LiteralOrIri::fromStringRepresentation(
entity.value());
if constexpr (onlyReturnLiterals) {
if (!litOrIri.isLiteral()) {
return std::nullopt;
}
}
if constexpr (removeQuotesAndAngleBrackets) {
entity = asStringViewUnsafe(litOrIri.getContent());
}
// TODO<joka921> handle the exporting of literals more correctly.
return std::pair{escapeFunction(std::move(entity.value())), nullptr};
return handleIriOrLiteral(litOrIri);
}
case LocalVocabIndex: {
std::string word = localVocab.getWord(id.getLocalVocabIndex());
if constexpr (onlyReturnLiterals) {
if (!word.starts_with('"')) {
return std::nullopt;
}
}
if constexpr (removeQuotesAndAngleBrackets) {
word = RdfEscaping::normalizedContentFromLiteralOrIri(std::move(word));
}
return std::pair{escapeFunction(std::move(word)), nullptr};
return handleIriOrLiteral(localVocab.getWord(id.getLocalVocabIndex()));
}
case TextRecordIndex:
return std::pair{
Expand Down
10 changes: 6 additions & 4 deletions src/engine/GroupByHashMapOptimization.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@
auto valueIdResultGetter = [](ValueId id) { return id; };
auto stringResultGetter =
[localVocab](const ad_utility::triple_component::LiteralOrIri& str) {
auto localVocabIndex = localVocab->getIndexAndAddIfNotContained(
str.toStringRepresentation());
auto localVocabIndex = localVocab->getIndexAndAddIfNotContained(str);

Check warning on line 93 in src/engine/GroupByHashMapOptimization.h

View check run for this annotation

Codecov / codecov/patch

src/engine/GroupByHashMapOptimization.h#L93

Added line #L93 was not covered by tests
return ValueId::makeFromLocalVocabIndex(localVocabIndex);
};
return std::visit(ad_utility::OverloadCallOperator(valueIdResultGetter,
Expand Down Expand Up @@ -163,8 +162,11 @@

// _____________________________________________________________________________
[[nodiscard]] ValueId calculateResult(LocalVocab* localVocab) const {
auto localVocabIndex =
localVocab->getIndexAndAddIfNotContained(currentValue_);
using namespace ad_utility::triple_component;
using Lit = ad_utility::triple_component::Literal;
auto localVocabIndex = localVocab->getIndexAndAddIfNotContained(
LiteralOrIri{Lit::literalWithNormalizedContent(
asNormalizedStringViewUnsafe(currentValue_))});
return ValueId::makeFromLocalVocabIndex(localVocabIndex);
}

Expand Down
23 changes: 10 additions & 13 deletions src/engine/LocalVocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,32 +31,27 @@ LocalVocab LocalVocab::merge(std::span<const LocalVocab*> vocabs) {
// _____________________________________________________________________________
template <typename WordT>
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
// TODO<joka921> As soon as we store `IdOrString` in the local vocab, we
// should definitely use `insert` instead of `emplace` here for some
// transparency optimizations. We currently need `emplace` because of the
// explicit conversion from `string` to `AlignedString16`.
auto [wordIterator, isNewWord] =
primaryWordSet().emplace(std::forward<WordT>(word));
auto [wordIterator, isNewWord] = primaryWordSet().insert(AD_FWD(word));
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
// the MacOS build).
return &(*wordIterator);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(
const std::string& word) {
const LiteralOrIri& word) {
return getIndexAndAddIfNotContainedImpl(word);
}

// _____________________________________________________________________________
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) {
LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(LiteralOrIri&& word) {
return getIndexAndAddIfNotContainedImpl(std::move(word));
}

// _____________________________________________________________________________
std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
const std::string& word) const {
auto localVocabIndex = primaryWordSet().find(StringAligned16{word});
const LiteralOrIri& word) const {
auto localVocabIndex = primaryWordSet().find(word);
if (localVocabIndex != primaryWordSet().end()) {
// TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
// the MacOS build).
Expand All @@ -67,13 +62,15 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
}

// _____________________________________________________________________________
const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const {
const LocalVocab::LiteralOrIri& LocalVocab::getWord(
LocalVocabIndex localVocabIndex) const {
return *localVocabIndex;
}

// _____________________________________________________________________________
std::vector<std::string> LocalVocab::getAllWordsForTesting() const {
std::vector<std::string> result;
std::vector<LocalVocab::LiteralOrIri> LocalVocab::getAllWordsForTesting()
const {
std::vector<LiteralOrIri> result;
std::ranges::copy(primaryWordSet(), std::back_inserter(result));
for (const auto& previous : otherWordSets_) {
std::ranges::copy(*previous, std::back_inserter(result));
Expand Down
21 changes: 10 additions & 11 deletions src/engine/LocalVocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,20 @@

#include "absl/container/node_hash_set.h"
#include "global/Id.h"
#include "parser/LiteralOrIri.h"

// A class for maintaining a local vocabulary with contiguous (local) IDs. This is
// meant for words that are not part of the normal vocabulary (constructed from
// the input data at indexing time).
//
// TODO: This is a first version of this class with basic functionality. Note
// that the local vocabulary used to be a simple `std::vector<std::string>`
// defined inside of the `ResultTable` class. You gotta start somewhere.
class LocalVocab {
private:
using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;
// A set of the words in the local vocabulary (the address of a stored word
// serves as its local ID). This is a node hash set because we need the
// addresses of the words (which are of type
// `std::string`) to remain stable over their lifetime in the hash map because
// we hand out pointers to them.
using Set = absl::node_hash_set<StringAligned16>;
// `LiteralOrIri`) to remain stable over their lifetime in the hash map
// because we hand out pointers to them.
using Set = absl::node_hash_set<LiteralOrIri>;
std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();

// Local vocabularies from child operations that were merged into this
Expand Down Expand Up @@ -58,13 +57,13 @@ class LocalVocab {
// Get the index of a word in the local vocabulary. If the word was already
// contained, return the already existing index. If the word was not yet
// contained, add it, and return the new index.
LocalVocabIndex getIndexAndAddIfNotContained(const std::string& word);
LocalVocabIndex getIndexAndAddIfNotContained(std::string&& word);
LocalVocabIndex getIndexAndAddIfNotContained(const LiteralOrIri& word);
LocalVocabIndex getIndexAndAddIfNotContained(LiteralOrIri&& word);

// Get the index of a word in the local vocabulary, or std::nullopt if it is
// not contained. This is useful for testing.
std::optional<LocalVocabIndex> getIndexOrNullopt(
const std::string& word) const;
const LiteralOrIri& word) const;

// The number of words in the vocabulary.
// Note: This is not constant time, but linear in the number of word sets.
Expand All @@ -80,14 +79,14 @@ class LocalVocab {
bool empty() const { return size() == 0; }

// Return a const reference to the word.
const std::string& getWord(LocalVocabIndex localVocabIndex) const;
const LiteralOrIri& getWord(LocalVocabIndex localVocabIndex) const;

// Create a local vocab that contains and keeps alive all the words from each
// of the `vocabs`. The primary word set of the newly created vocab is empty.
static LocalVocab merge(std::span<const LocalVocab*> vocabs);

// Return all the words from all the word sets as a vector.
std::vector<std::string> getAllWordsForTesting() const;
std::vector<LiteralOrIri> getAllWordsForTesting() const;

private:
// Common implementation for the two variants of
Expand Down
3 changes: 1 addition & 2 deletions src/engine/sparqlExpressions/SparqlExpressionTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,7 @@ Id constantExpressionResultToId(T&& result, LocalVocabT& localVocab) {
if constexpr (ad_utility::isSimilar<
R, ad_utility::triple_component::LiteralOrIri>) {
return Id::makeFromLocalVocabIndex(
localVocab.getIndexAndAddIfNotContained(
AD_FWD(el).toStringRepresentation()));
localVocab.getIndexAndAddIfNotContained(AD_FWD(el)));
} else {
static_assert(ad_utility::isSimilar<R, Id>);
return el;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ auto EffectiveBooleanValueGetter::operator()(
: True;
}
case Datatype::LocalVocabIndex: {
return (context->_localVocab.getWord(id.getLocalVocabIndex()).empty())
return (context->_localVocab.getWord(id.getLocalVocabIndex())
.getContent()
.empty())
? False
: True;
}
Expand Down
9 changes: 2 additions & 7 deletions src/global/IndexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#define QLEVER_INDEXTYPES_H

#include "./TypedIndex.h"
#include "parser/LiteralOrIri.h"

// Typedefs for several kinds of typed indices that are used across QLever.

Expand All @@ -14,13 +15,7 @@
// requests.
using VocabIndex = ad_utility::TypedIndex<uint64_t, "VocabIndex">;

// A `std::string` that is aligned to 16 bytes s.t. pointers always end with 4
// bits that are zero and that are reused for payloads in the `ValueId` class.
struct alignas(16) StringAligned16 : public std::string {
using std::string::basic_string;
explicit StringAligned16(std::string s) : std::string{std::move(s)} {}
};
using LocalVocabIndex = const StringAligned16*;
using LocalVocabIndex = const ad_utility::triple_component::LiteralOrIri*;
using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
Expand Down
2 changes: 1 addition & 1 deletion src/global/ValueId.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ class ValueId {
ostr << value.toStringAndType().first;
} else if constexpr (ad_utility::isSimilar<T, LocalVocabIndex>) {
AD_CORRECTNESS_CHECK(value != nullptr);
ostr << *value;
ostr << value->toStringRepresentation();
} else {
// T is `VocabIndex | TextRecordIndex`
ostr << std::to_string(value.get());
Expand Down
7 changes: 6 additions & 1 deletion src/parser/LiteralOrIri.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ static constexpr char iriPrefixChar = '<';
static constexpr std::string_view iriPrefix{&iriPrefixChar, 1};
static constexpr std::string_view literalPrefix{&literalPrefixChar, 1};
// A wrapper class that can contain either an Iri or a Literal object.
class LiteralOrIri {
class alignas(16) LiteralOrIri {
private:
using LiteralOrIriVariant = std::variant<Literal, Iri>;
LiteralOrIriVariant data_;
Expand Down Expand Up @@ -61,6 +61,11 @@ class LiteralOrIri {
}
bool operator==(const LiteralOrIri&) const = default;

// Three-way comparison that orders two `LiteralOrIri` objects by comparing the
// results of their `toStringRepresentation()` calls.
// TODO<joka921> Use something unicode-based for this.
auto operator<=>(const LiteralOrIri& rhs) const {
  const auto& ownRepresentation = toStringRepresentation();
  const auto& otherRepresentation = rhs.toStringRepresentation();
  return ownRepresentation <=> otherRepresentation;
}

// Return true if object contains an Iri object
bool isIri() const;

Expand Down
19 changes: 8 additions & 11 deletions src/parser/TripleComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,22 +219,19 @@ class TripleComponent {
if (!id) {
// If `toValueId` could not convert to `Id`, we have a string, which we
// look up in (and potentially add to) our local vocabulary.
AD_CORRECTNESS_CHECK(isString() || isLiteral() || isIri());
std::string& newWord = [&]() -> std::string& {
if (isString()) {
return getString();
AD_CORRECTNESS_CHECK(isLiteral() || isIri());
using LiteralOrIri = ad_utility::triple_component::LiteralOrIri;
auto moveWord = [&]() -> LiteralOrIri {
if (isLiteral()) {
return LiteralOrIri{std::move(getLiteral())};
} else {
if (isLiteral()) {
return getLiteral().toStringRepresentation();
} else {
return getIri().toStringRepresentation();
}
return LiteralOrIri{std::move(getIri())};
}
}();
};
// NOTE: There is a `&&` version of `getIndexAndAddIfNotContained`, so the
// temporary returned by `moveWord()` is passed on as an rvalue instead of
// being copied.
id = Id::makeFromLocalVocabIndex(
localVocab.getIndexAndAddIfNotContained(std::move(newWord)));
localVocab.getIndexAndAddIfNotContained(moveWord()));
}
return id.value();
}
Expand Down
17 changes: 12 additions & 5 deletions test/GroupByTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,12 @@ TEST_F(GroupByTest, doGroupBy) {

// Create an input result table with a local vocabulary.
auto localVocab = std::make_shared<LocalVocab>();
localVocab->getIndexAndAddIfNotContained("<local1>");
localVocab->getIndexAndAddIfNotContained("<local2>");
localVocab->getIndexAndAddIfNotContained("<local3>");
constexpr auto iriref = [](const std::string& s) {
return ad_utility::triple_component::LiteralOrIri::iriref(s);
};
localVocab->getIndexAndAddIfNotContained(iriref("<local1>"));
localVocab->getIndexAndAddIfNotContained(iriref("<local2>"));
localVocab->getIndexAndAddIfNotContained(iriref("<local3>"));

IdTable inputData(6, makeAllocator());
// The input data types are KB, KB, VERBATIM, TEXT, FLOAT, STRING.
Expand Down Expand Up @@ -1231,7 +1234,9 @@ TEST_F(GroupByOptimizations, hashMapOptimizationGroupConcatIndex) {

auto getId = makeGetId(qec->getIndex());
auto getLocalVocabId = [&result](const std::string& word) {
auto value = result->localVocab().getIndexOrNullopt(word);
auto lit =
ad_utility::triple_component::LiteralOrIri::literalWithoutQuotes(word);
auto value = result->localVocab().getIndexOrNullopt(lit);
if (value.has_value())
return ValueId::makeFromLocalVocabIndex(value.value());
else
Expand Down Expand Up @@ -1278,7 +1283,9 @@ TEST_F(GroupByOptimizations, hashMapOptimizationGroupConcatLocalVocab) {
auto getId = makeGetId(qec->getIndex());
auto d = DoubleId;
auto getLocalVocabId = [&result](const std::string& word) {
auto value = result->localVocab().getIndexOrNullopt(word);
auto lit =
ad_utility::triple_component::LiteralOrIri::literalWithoutQuotes(word);
auto value = result->localVocab().getIndexOrNullopt(lit);
if (value.has_value())
return ValueId::makeFromLocalVocabIndex(value.value());
else
Expand Down
Loading
Loading