Skip to content

Commit

Permalink
Ranked OR TAAT (#73)
Browse files Browse the repository at this point in the history
  • Loading branch information
amallia authored Jan 21, 2019
1 parent fe849bb commit 790b7bd
Show file tree
Hide file tree
Showing 19 changed files with 223 additions and 104 deletions.
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ list(APPEND LCOV_REMOVE_PATTERNS "'${PROJECT_SOURCE_DIR}/external/*'")


if (UNIX)

# For hardware popcount and other special instructions
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")

Expand All @@ -62,8 +61,6 @@ endif()

set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)


include_directories(include)
add_library(pisa INTERFACE)
Expand Down
15 changes: 15 additions & 0 deletions include/pisa/accumulator/simple_accumulator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

namespace pisa {

/// Dense score accumulator: one float slot per document id, stored directly
/// in the underlying std::vector<float>.
struct Simple_Accumulator : public std::vector<float> {
    /// Creates an accumulator with `size` slots, each value-initialized to zero.
    Simple_Accumulator(std::ptrdiff_t size) : std::vector<float>(size) {}

    /// Resets every slot to zero; the number of slots is left unchanged.
    void init() { std::fill(this->begin(), this->end(), 0.0f); }

    /// Adds `score` to the slot of document `doc`.
    /// The access is unchecked: `doc` must be smaller than size().
    void accumulate(uint32_t doc, float score) { (*this)[doc] += score; }

    /// Offers every slot to `topk` as a (score, docid) pair, visiting the
    /// slots front to back so that docid equals the slot's position.
    void aggregate(topk_queue &topk) {
        uint64_t docid = 0u;
        for (float score : *this) {
            topk.insert(score, docid++);
        }
    }
};

}
1 change: 1 addition & 0 deletions include/pisa/block_posting_list.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace pisa {

class document_enumerator {
public:

document_enumerator(uint8_t const* data, uint64_t universe,
size_t term_id = 0)
: m_n(0) // just to silence warnings
Expand Down
1 change: 1 addition & 0 deletions include/pisa/freq_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ namespace pisa {

class document_enumerator {
public:

void reset()
{
m_cur_pos = 0;
Expand Down
14 changes: 9 additions & 5 deletions include/pisa/query/algorithm/and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

namespace pisa {

template <bool with_freqs>
template <typename Index, bool with_freqs>
struct and_query {

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) const {
and_query(Index const &index) : m_index(index) {}

uint64_t operator()(term_id_vec terms) const {
if (terms.empty())
return 0;
remove_duplicate_terms(terms);
Expand All @@ -16,7 +17,7 @@ struct and_query {
enums.reserve(terms.size());

for (auto term : terms) {
enums.push_back(index[term]);
enums.push_back(m_index[term]);
}

// sort by increasing frequency
Expand All @@ -27,7 +28,7 @@ struct and_query {
uint64_t results = 0;
uint64_t candidate = enums[0].docid();
size_t i = 1;
while (candidate < index.num_docs()) {
while (candidate < m_index.num_docs()) {
for (; i < enums.size(); ++i) {
enums[i].next_geq(candidate);
if (enums[i].docid() != candidate) {
Expand All @@ -52,6 +53,9 @@ struct and_query {
}
return results;
}

private:
Index const &m_index;
};

} // namespace pisa
17 changes: 9 additions & 8 deletions include/pisa/query/algorithm/block_max_maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct block_max_maxscore_query {

typedef bm25 scorer_type;

block_max_maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
block_max_maxscore_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
typedef typename WandType::wand_data_enumerator wdata_enum;

Expand All @@ -32,7 +32,7 @@ struct block_max_maxscore_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto w_enum = m_wdata->getenum(term.first);
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
auto max_weight = q_weight * m_wdata->max_term_weight(term.first);
Expand Down Expand Up @@ -66,10 +66,10 @@ struct block_max_maxscore_query {
})
->docs_enum.docid();

while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) {
while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) {
float score = 0;
float norm_len = m_wdata->norm_len(cur_doc);
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) {
if (ordered_enums[i]->docs_enum.docid() == cur_doc) {
score +=
Expand Down Expand Up @@ -129,6 +129,7 @@ struct block_max_maxscore_query {
std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
13 changes: 7 additions & 6 deletions include/pisa/query/algorithm/block_max_wand_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct block_max_wand_query {
typedef bm25 scorer_type;

block_max_wand_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
block_max_wand_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();

if (terms.empty())
return 0;
auto query_term_freqs = query_freqs(terms);
uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
typedef typename WandType::wand_data_enumerator wdata_enum;

Expand All @@ -30,7 +30,7 @@ struct block_max_wand_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto w_enum = m_wdata->getenum(term.first);
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);

Expand Down Expand Up @@ -204,6 +204,7 @@ struct block_max_wand_query {
topk_queue const &get_topk() const { return m_topk; }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
16 changes: 8 additions & 8 deletions include/pisa/query/algorithm/maxscore_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,21 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct maxscore_query {

typedef bm25 scorer_type;

maxscore_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
maxscore_query(Index const &index, WandType const &wdata, uint64_t k) : m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec const &terms) {
uint64_t operator()(term_id_vec const &terms) {
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
struct scored_enum {
enum_type docs_enum;
Expand All @@ -29,7 +28,7 @@ struct maxscore_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
auto max_weight = q_weight * m_wdata->max_term_weight(term.first);
enums.push_back(scored_enum{std::move(list), q_weight, max_weight});
Expand Down Expand Up @@ -62,10 +61,10 @@ struct maxscore_query {
})
->docs_enum.docid();

while (non_essential_lists < ordered_enums.size() && cur_doc < index.num_docs()) {
while (non_essential_lists < ordered_enums.size() && cur_doc < m_index.num_docs()) {
float score = 0;
float norm_len = m_wdata->norm_len(cur_doc);
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = non_essential_lists; i < ordered_enums.size(); ++i) {
if (ordered_enums[i]->docs_enum.docid() == cur_doc) {
score +=
Expand Down Expand Up @@ -109,6 +108,7 @@ struct maxscore_query {
std::vector<std::pair<float, uint64_t>> const &topk() const { return m_topk.topk(); }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
16 changes: 10 additions & 6 deletions include/pisa/query/algorithm/or_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

namespace pisa {

template <bool with_freqs>
template <typename Index, bool with_freqs>
struct or_query {

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) const {
or_query(Index const &index) : m_index(index) {}

uint64_t operator()(term_id_vec terms) const {
if (terms.empty())
return 0;
remove_duplicate_terms(terms);
Expand All @@ -16,7 +17,7 @@ struct or_query {
enums.reserve(terms.size());

for (auto term : terms) {
enums.push_back(index[term]);
enums.push_back(m_index[term]);
}

uint64_t results = 0;
Expand All @@ -27,9 +28,9 @@ struct or_query {
})
->docid();

while (cur_doc < index.num_docs()) {
while (cur_doc < m_index.num_docs()) {
results += 1;
uint64_t next_doc = index.num_docs();
uint64_t next_doc = m_index.num_docs();
for (size_t i = 0; i < enums.size(); ++i) {
if (enums[i].docid() == cur_doc) {
if (with_freqs) {
Expand All @@ -47,6 +48,9 @@ struct or_query {

return results;
}

private:
Index const &m_index;
};

} // namespace pisa
15 changes: 8 additions & 7 deletions include/pisa/query/algorithm/ranked_and_query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@

namespace pisa {

template <typename WandType>
template <typename Index, typename WandType>
struct ranked_and_query {

typedef bm25 scorer_type;

ranked_and_query(WandType const &wdata, uint64_t k) : m_wdata(&wdata), m_topk(k) {}
ranked_and_query(Index const &index, WandType const &wdata, uint64_t k)
: m_index(index), m_wdata(&wdata), m_topk(k) {}

template <typename Index>
uint64_t operator()(Index const &index, term_id_vec terms) {
uint64_t operator()(term_id_vec terms) {
size_t results = 0;
m_topk.clear();
if (terms.empty())
return 0;

auto query_term_freqs = query_freqs(terms);

uint64_t num_docs = index.num_docs();
uint64_t num_docs = m_index.num_docs();
typedef typename Index::document_enumerator enum_type;
struct scored_enum {
enum_type docs_enum;
Expand All @@ -29,7 +29,7 @@ struct ranked_and_query {
enums.reserve(query_term_freqs.size());

for (auto term : query_term_freqs) {
auto list = index[term.first];
auto list = m_index[term.first];
auto q_weight = scorer_type::query_term_weight(term.second, list.size(), num_docs);
enums.push_back(scored_enum{std::move(list), q_weight});
}
Expand All @@ -41,7 +41,7 @@ struct ranked_and_query {

uint64_t candidate = enums[0].docs_enum.docid();
size_t i = 1;
while (candidate < index.num_docs()) {
while (candidate < m_index.num_docs()) {
for (; i < enums.size(); ++i) {
enums[i].docs_enum.next_geq(candidate);
if (enums[i].docs_enum.docid() != candidate) {
Expand Down Expand Up @@ -80,6 +80,7 @@ struct ranked_and_query {
topk_queue &get_topk() { return m_topk; }

private:
Index const & m_index;
WandType const *m_wdata;
topk_queue m_topk;
};
Expand Down
Loading

0 comments on commit 790b7bd

Please sign in to comment.