Skip to content

Commit

Permalink
Parsing (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
elshize authored and amallia committed Jan 14, 2019
1 parent 18f8460 commit e3a3121
Show file tree
Hide file tree
Showing 26 changed files with 11,945 additions and 99 deletions.
12 changes: 12 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,21 @@
[submodule "external/mio"]
path = external/mio
url = https://github.com/mandreyel/mio.git
[submodule "external/GSL"]
path = external/GSL
url = https://github.com/Microsoft/GSL.git
[submodule "external/gumbo-parser"]
path = external/gumbo-parser
url = https://github.com/google/gumbo-parser.git
[submodule "external/Catch2"]
path = external/Catch2
url = https://github.com/catchorg/Catch2.git
[submodule "external/boost-cmake"]
path = external/boost-cmake
url = https://github.com/Orphis/boost-cmake.git
[submodule "external/Porter2"]
path = external/Porter2
url = https://github.com/pisa-engine/Porter2.git
[submodule "external/warcpp"]
path = external/warcpp
url = https://github.com/pisa-engine/warcpp.git
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ matrix:
packages: ['clang-6.0', 'libstdc++-8-dev']
env: MATRIX_EVAL="CC=clang-6.0 && CXX=clang++-6.0 && COVERAGE=Off"
- os: osx
osx_image: xcode10.1
compiler: clang
env: MATRIX_EVAL="CC=clang && CXX=clang++ && COVERAGE=Off"

# Install dependencies
before_install:
- eval "${MATRIX_EVAL}"

script:
- mkdir build
- cd build
Expand Down
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# include(external/external.cmake)
add_subdirectory(external)

# Build the vendored gumbo-parser submodule with its own autotools build and
# expose the resulting static library as the imported target gumbo::gumbo.
# The configure step installs into ${CMAKE_BINARY_DIR}/gumbo-parser.
include(ExternalProject)
ExternalProject_Add(gumbo-external
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser
    # In-source build: autogen.sh/configure are invoked relative to the source tree.
    BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser
    CONFIGURE_COMMAND ./autogen.sh && ./configure --prefix=${CMAKE_BINARY_DIR}/gumbo-parser
    # NOTE(review): MAKE is not a variable CMake itself defines. Unless it is set
    # elsewhere in this file, this expands to nothing and the library is only
    # built by the default install step -- confirm this is intended.
    BUILD_COMMAND ${MAKE}
    # Declare the archive produced by the install step so that generators which
    # need every output to be known (e.g. Ninja) can resolve IMPORTED_LOCATION.
    BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/gumbo-parser/lib/libgumbo.a)

# Imported target wrapping the installed archive; headers are taken directly
# from the submodule's src/ directory.
# NOTE(review): nothing here orders gumbo-external before consumers of
# gumbo::gumbo -- confirm that consumers add a dependency on gumbo-external.
add_library(gumbo::gumbo UNKNOWN IMPORTED)
set_target_properties(gumbo::gumbo PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser/src
    IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/gumbo-parser/lib/libgumbo.a)

# Add code coverage
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/external/CMake-codecov/cmake")
find_package(codecov)
Expand Down
4 changes: 4 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ target_link_libraries(index_perftest
QMX
simdcomp
ParallelSTL
GSL
)

add_executable(perftest_interpolative perftest_interpolative.cpp)
Expand All @@ -17,6 +18,7 @@ target_link_libraries(perftest_interpolative
FastPFor
QMX
simdcomp
GSL
)

add_executable(selective_queries selective_queries.cpp)
Expand All @@ -29,11 +31,13 @@ target_link_libraries(selective_queries
QMX
simdcomp
ParallelSTL
GSL
)

add_executable(scan_perftest scan_perftest.cpp)
target_link_libraries(scan_perftest
Boost::boost
mio
ParallelSTL
GSL
)
15 changes: 12 additions & 3 deletions external/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,21 @@ target_compile_options(ParallelSTL INTERFACE -Wno-unused-parameter
-Wno-unknown-pragmas
-Wno-sign-compare
-Wno-reorder
-Wno-unused-local-typedef
-Wno-unused-local-typedefs
)

# Add Catch
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Catch2)
# Add GSL
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/GSL EXCLUDE_FROM_ALL)

# Add Boost
add_subdirectory(boost-cmake)

# Add Catch
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Catch2)

# Add Porter2
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Porter2)

# Add warcpp
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/warcpp)

1 change: 1 addition & 0 deletions external/GSL
Submodule GSL added at 0f68d1
1 change: 1 addition & 0 deletions external/Porter2
Submodule Porter2 added at d9a7b8
1 change: 1 addition & 0 deletions external/gumbo-parser
Submodule gumbo-parser added at aa91b2
1 change: 1 addition & 0 deletions external/warcpp
Submodule warcpp added at aabed2
150 changes: 64 additions & 86 deletions include/binary_collection.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#pragma once

#include <stdexcept>
#include <iterator>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <type_traits>

#include "mio/mmap.hpp"

#include "util/util.hpp"

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
Expand All @@ -12,19 +15,22 @@

namespace ds2i {

class binary_collection {
public:
typedef uint32_t posting_type;
template <typename Source = mio::mmap_source>
class base_binary_collection {
public:
using posting_type = uint32_t;
using pointer = typename std::conditional<std::is_same<Source, mio::mmap_source>::value,
posting_type const,
posting_type>::type *;

binary_collection(const char* filename)
{
base_binary_collection(const char *filename) {
std::error_code error;
m_file.map(filename, error);
if ( error ) {
std::cerr << "error mapping file: " << error.message() << ", exiting..." << std::endl;
throw std::runtime_error("Error opening file");
}
m_data = (posting_type const*)m_file.data();
m_data = reinterpret_cast<pointer>(m_file.data());
m_data_size = m_file.size() / sizeof(m_data[0]);

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
Expand All @@ -34,100 +40,70 @@ namespace ds2i {
#endif
}

class iterator;

iterator begin() const
{
return iterator(this, 0);
}

iterator end() const
{
return iterator(this, m_data_size);
}

class sequence {
public:
sequence()
: m_begin(nullptr)
, m_end(nullptr)
{}

posting_type const* begin() const
{
return m_begin;
}
public:
sequence(pointer begin, pointer end) : m_begin(begin), m_end(end) {}
sequence() : m_begin(nullptr), m_end(nullptr) {}

posting_type const* end() const
{
return m_end;
}
pointer begin() const { return m_begin; }
pointer end() const { return m_end; }
size_t size() const { return m_end - m_begin; }

posting_type back() const
{
assert(size());
return *(m_end - 1);
}

size_t size() const
{
return m_end - m_begin;
}
private:
pointer m_begin;
pointer m_end;
};

private:
friend class binary_collection::iterator;
using const_sequence = sequence;

sequence(posting_type const* begin, posting_type const* end)
: m_begin(begin)
, m_end(end)
{}
template <typename S>
class base_iterator;

posting_type const* m_begin;
posting_type const* m_end;
};
using const_iterator = base_iterator<const_sequence>;
using iterator = typename std::conditional<std::is_same<Source, mio::mmap_source>::value,
const_iterator,
base_iterator<sequence>>::type;

class iterator : public std::iterator<std::forward_iterator_tag,
sequence> {
public:
iterator()
: m_collection(nullptr)
{}
iterator begin() { return iterator(this, 0); }
iterator end() { return iterator(this, m_data_size); }
const_iterator begin() const { return const_iterator(this, 0); }
const_iterator end() const { return const_iterator(this, m_data_size); }
const_iterator cbegin() const { return const_iterator(this, 0); }
const_iterator cend() const { return const_iterator(this, m_data_size); }

value_type const& operator*() const
{
return m_cur_seq;
}
template <typename S>
class base_iterator : public std::iterator<std::forward_iterator_tag, S> {
public:
base_iterator() : m_collection(nullptr) {}

value_type const* operator->() const
{
return &m_cur_seq;
}
auto const &operator*() const { return m_cur_seq; }

iterator& operator++()
{
auto const *operator-> () const { return &m_cur_seq; }

base_iterator &operator++() {
m_pos = m_next_pos;
read();
return *this;
}

bool operator==(iterator const& other) const
{
bool operator==(base_iterator const &other) const {
assert(m_collection == other.m_collection);
return m_pos == other.m_pos;
}

bool operator!=(iterator const& other) const
{
return !(*this == other);
}
bool operator!=(base_iterator const &other) const { return !(*this == other); }

private:
friend class binary_collection;
private:
friend class base_binary_collection;

iterator(binary_collection const* coll, size_t pos)
: m_collection(coll)
, m_pos(pos)
{
base_iterator(base_binary_collection const *coll, size_t pos)
: m_collection(coll), m_pos(pos) {
read();
}

Expand All @@ -138,24 +114,26 @@ namespace ds2i {

size_t n = 0;
size_t pos = m_pos;
while (!(n = m_collection->m_data[pos++])); // skip empty seqs
n = m_collection->m_data[pos++];
// file might be truncated
n = std::min(n, size_t(m_collection->m_data_size - pos));
posting_type const* begin = &m_collection->m_data[pos];
posting_type const* end = begin + n;
auto begin = &m_collection->m_data[pos];

m_next_pos = pos + n;
m_cur_seq = sequence(begin, end);
m_cur_seq = S(begin, begin + n);
}

binary_collection const* m_collection;
size_t m_pos, m_next_pos;
sequence m_cur_seq;
base_binary_collection const * m_collection;
size_t m_pos, m_next_pos;
S m_cur_seq;
};

private:
mio::mmap_source m_file;
posting_type const* m_data;
size_t m_data_size;
private:
Source m_file;
pointer m_data;
size_t m_data_size;
};

// Collection using the default Source (mio::mmap_source); for this Source the
// `pointer` type is `posting_type const *` and `iterator` is `const_iterator`.
using binary_collection = base_binary_collection<>;
// Collection backed by mio::mmap_sink; here `pointer` is a non-const
// `posting_type *`, so sequences expose mutable pointers into the mapping.
using writable_binary_collection = base_binary_collection<mio::mmap_sink>;
}
16 changes: 7 additions & 9 deletions include/binary_freq_collection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ namespace ds2i {
}

struct sequence {
binary_collection::sequence docs;
binary_collection::sequence freqs;
binary_collection::const_sequence docs;
binary_collection::const_sequence freqs;
};

class iterator : public std::iterator<std::forward_iterator_tag,
Expand Down Expand Up @@ -86,17 +86,15 @@ namespace ds2i {
private:
friend class binary_freq_collection;

iterator(binary_collection::iterator docs_it,
binary_collection::iterator freqs_it)
: m_docs_it(docs_it)
, m_freqs_it(freqs_it)
{
iterator(binary_collection::const_iterator docs_it,
binary_collection::const_iterator freqs_it)
: m_docs_it(docs_it), m_freqs_it(freqs_it) {
m_cur_seq.docs = *m_docs_it;
m_cur_seq.freqs = *m_freqs_it;
}

binary_collection::iterator m_docs_it;
binary_collection::iterator m_freqs_it;
binary_collection::const_iterator m_docs_it;
binary_collection::const_iterator m_freqs_it;
sequence m_cur_seq;
};

Expand Down
Loading

0 comments on commit e3a3121

Please sign in to comment.