Skip to content

Commit e3a3121

Browse files
elshizeamallia
authored and committed
Parsing (#37)
1 parent 18f8460 commit e3a3121

26 files changed

+11945
-99
lines changed

.gitmodules

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,21 @@
3131
[submodule "external/mio"]
3232
path = external/mio
3333
url = https://github.com/mandreyel/mio.git
34+
[submodule "external/GSL"]
35+
path = external/GSL
36+
url = https://github.com/Microsoft/GSL.git
37+
[submodule "external/gumbo-parser"]
38+
path = external/gumbo-parser
39+
url = https://github.com/google/gumbo-parser.git
3440
[submodule "external/Catch2"]
3541
path = external/Catch2
3642
url = https://github.com/catchorg/Catch2.git
3743
[submodule "external/boost-cmake"]
3844
path = external/boost-cmake
3945
url = https://github.com/Orphis/boost-cmake.git
46+
[submodule "external/Porter2"]
47+
path = external/Porter2
48+
url = https://github.com/pisa-engine/Porter2.git
49+
[submodule "external/warcpp"]
50+
path = external/warcpp
51+
url = https://github.com/pisa-engine/warcpp.git

.travis.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,13 +56,14 @@ matrix:
5656
packages: ['clang-6.0', 'libstdc++-8-dev']
5757
env: MATRIX_EVAL="CC=clang-6.0 && CXX=clang++-6.0 && COVERAGE=Off"
5858
- os: osx
59+
osx_image: xcode10.1
5960
compiler: clang
6061
env: MATRIX_EVAL="CC=clang && CXX=clang++ && COVERAGE=Off"
6162

6263
# Install dependencies
6364
before_install:
6465
- eval "${MATRIX_EVAL}"
65-
66+
6667
script:
6768
- mkdir build
6869
- cd build

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,18 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
2020
# include(external/external.cmake)
2121
add_subdirectory(external)
2222

23+
include(ExternalProject)
24+
ExternalProject_Add(gumbo-external
25+
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser
26+
BINARY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser
27+
CONFIGURE_COMMAND ./autogen.sh && ./configure --prefix=${CMAKE_BINARY_DIR}/gumbo-parser
28+
BUILD_COMMAND ${MAKE})
29+
add_library(gumbo::gumbo UNKNOWN IMPORTED)
30+
set_target_properties(gumbo::gumbo PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
31+
${CMAKE_CURRENT_SOURCE_DIR}/external/gumbo-parser/src)
32+
set_property(TARGET gumbo::gumbo APPEND PROPERTY IMPORTED_LOCATION
33+
${CMAKE_BINARY_DIR}/gumbo-parser/lib/libgumbo.a)
34+
2335
# Add code coverage
2436
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/external/CMake-codecov/cmake")
2537
find_package(codecov)

benchmarks/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ target_link_libraries(index_perftest
88
QMX
99
simdcomp
1010
ParallelSTL
11+
GSL
1112
)
1213

1314
add_executable(perftest_interpolative perftest_interpolative.cpp)
@@ -17,6 +18,7 @@ target_link_libraries(perftest_interpolative
1718
FastPFor
1819
QMX
1920
simdcomp
21+
GSL
2022
)
2123

2224
add_executable(selective_queries selective_queries.cpp)
@@ -29,11 +31,13 @@ target_link_libraries(selective_queries
2931
QMX
3032
simdcomp
3133
ParallelSTL
34+
GSL
3235
)
3336

3437
add_executable(scan_perftest scan_perftest.cpp)
3538
target_link_libraries(scan_perftest
3639
Boost::boost
3740
mio
3841
ParallelSTL
42+
GSL
3943
)

external/CMakeLists.txt

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -75,12 +75,21 @@ target_compile_options(ParallelSTL INTERFACE -Wno-unused-parameter
7575
-Wno-unknown-pragmas
7676
-Wno-sign-compare
7777
-Wno-reorder
78-
-Wno-unused-local-typedef
78+
-Wno-unused-local-typedefs
7979
)
8080

81-
# Add Catch
82-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Catch2)
81+
# Add GSL
82+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/GSL EXCLUDE_FROM_ALL)
8383

8484
# Add Boost
8585
add_subdirectory(boost-cmake)
8686

87+
# Add Catch
88+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Catch2)
89+
90+
# Add Porter2
91+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/Porter2)
92+
93+
# Add warcpp
94+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/warcpp)
95+

external/GSL

Submodule GSL added at 0f68d13

external/Porter2

Submodule Porter2 added at d9a7b82

external/gumbo-parser

Submodule gumbo-parser added at aa91b27

external/warcpp

Submodule warcpp added at aabed20

include/binary_collection.hpp

Lines changed: 64 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
#pragma once
22

3-
#include <stdexcept>
4-
#include <iterator>
53
#include <cstdint>
4+
#include <iterator>
5+
#include <stdexcept>
6+
#include <type_traits>
7+
68
#include "mio/mmap.hpp"
9+
710
#include "util/util.hpp"
811

912
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -12,19 +15,22 @@
1215

1316
namespace ds2i {
1417

15-
class binary_collection {
16-
public:
17-
typedef uint32_t posting_type;
18+
template <typename Source = mio::mmap_source>
19+
class base_binary_collection {
20+
public:
21+
using posting_type = uint32_t;
22+
using pointer = typename std::conditional<std::is_same<Source, mio::mmap_source>::value,
23+
posting_type const,
24+
posting_type>::type *;
1825

19-
binary_collection(const char* filename)
20-
{
26+
base_binary_collection(const char *filename) {
2127
std::error_code error;
2228
m_file.map(filename, error);
2329
if ( error ) {
2430
std::cerr << "error mapping file: " << error.message() << ", exiting..." << std::endl;
2531
throw std::runtime_error("Error opening file");
2632
}
27-
m_data = (posting_type const*)m_file.data();
33+
m_data = reinterpret_cast<pointer>(m_file.data());
2834
m_data_size = m_file.size() / sizeof(m_data[0]);
2935

3036
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -34,100 +40,70 @@ namespace ds2i {
3440
#endif
3541
}
3642

37-
class iterator;
38-
39-
iterator begin() const
40-
{
41-
return iterator(this, 0);
42-
}
43-
44-
iterator end() const
45-
{
46-
return iterator(this, m_data_size);
47-
}
48-
4943
class sequence {
50-
public:
51-
sequence()
52-
: m_begin(nullptr)
53-
, m_end(nullptr)
54-
{}
55-
56-
posting_type const* begin() const
57-
{
58-
return m_begin;
59-
}
44+
public:
45+
sequence(pointer begin, pointer end) : m_begin(begin), m_end(end) {}
46+
sequence() : m_begin(nullptr), m_end(nullptr) {}
6047

61-
posting_type const* end() const
62-
{
63-
return m_end;
64-
}
48+
pointer begin() const { return m_begin; }
49+
pointer end() const { return m_end; }
50+
size_t size() const { return m_end - m_begin; }
6551

6652
posting_type back() const
6753
{
6854
assert(size());
6955
return *(m_end - 1);
7056
}
7157

72-
size_t size() const
73-
{
74-
return m_end - m_begin;
75-
}
58+
private:
59+
pointer m_begin;
60+
pointer m_end;
61+
};
7662

77-
private:
78-
friend class binary_collection::iterator;
63+
using const_sequence = sequence;
7964

80-
sequence(posting_type const* begin, posting_type const* end)
81-
: m_begin(begin)
82-
, m_end(end)
83-
{}
65+
template <typename S>
66+
class base_iterator;
8467

85-
posting_type const* m_begin;
86-
posting_type const* m_end;
87-
};
68+
using const_iterator = base_iterator<const_sequence>;
69+
using iterator = typename std::conditional<std::is_same<Source, mio::mmap_source>::value,
70+
const_iterator,
71+
base_iterator<sequence>>::type;
8872

89-
class iterator : public std::iterator<std::forward_iterator_tag,
90-
sequence> {
91-
public:
92-
iterator()
93-
: m_collection(nullptr)
94-
{}
73+
iterator begin() { return iterator(this, 0); }
74+
iterator end() { return iterator(this, m_data_size); }
75+
const_iterator begin() const { return const_iterator(this, 0); }
76+
const_iterator end() const { return const_iterator(this, m_data_size); }
77+
const_iterator cbegin() const { return const_iterator(this, 0); }
78+
const_iterator cend() const { return const_iterator(this, m_data_size); }
9579

96-
value_type const& operator*() const
97-
{
98-
return m_cur_seq;
99-
}
80+
template <typename S>
81+
class base_iterator : public std::iterator<std::forward_iterator_tag, S> {
82+
public:
83+
base_iterator() : m_collection(nullptr) {}
10084

101-
value_type const* operator->() const
102-
{
103-
return &m_cur_seq;
104-
}
85+
auto const &operator*() const { return m_cur_seq; }
10586

106-
iterator& operator++()
107-
{
87+
auto const *operator-> () const { return &m_cur_seq; }
88+
89+
base_iterator &operator++() {
10890
m_pos = m_next_pos;
10991
read();
11092
return *this;
11193
}
11294

113-
bool operator==(iterator const& other) const
114-
{
95+
bool operator==(base_iterator const &other) const {
11596
assert(m_collection == other.m_collection);
11697
return m_pos == other.m_pos;
11798
}
11899

119-
bool operator!=(iterator const& other) const
120-
{
121-
return !(*this == other);
122-
}
100+
bool operator!=(base_iterator const &other) const { return !(*this == other); }
123101

124-
private:
125-
friend class binary_collection;
102+
private:
103+
friend class base_binary_collection;
126104

127-
iterator(binary_collection const* coll, size_t pos)
128-
: m_collection(coll)
129-
, m_pos(pos)
130-
{
105+
base_iterator(base_binary_collection const *coll, size_t pos)
106+
: m_collection(coll), m_pos(pos) {
131107
read();
132108
}
133109

@@ -138,24 +114,26 @@ namespace ds2i {
138114

139115
size_t n = 0;
140116
size_t pos = m_pos;
141-
while (!(n = m_collection->m_data[pos++])); // skip empty seqs
117+
n = m_collection->m_data[pos++];
142118
// file might be truncated
143119
n = std::min(n, size_t(m_collection->m_data_size - pos));
144-
posting_type const* begin = &m_collection->m_data[pos];
145-
posting_type const* end = begin + n;
120+
auto begin = &m_collection->m_data[pos];
146121

147122
m_next_pos = pos + n;
148-
m_cur_seq = sequence(begin, end);
123+
m_cur_seq = S(begin, begin + n);
149124
}
150125

151-
binary_collection const* m_collection;
152-
size_t m_pos, m_next_pos;
153-
sequence m_cur_seq;
126+
base_binary_collection const * m_collection;
127+
size_t m_pos, m_next_pos;
128+
S m_cur_seq;
154129
};
155130

156-
private:
157-
mio::mmap_source m_file;
158-
posting_type const* m_data;
159-
size_t m_data_size;
131+
private:
132+
Source m_file;
133+
pointer m_data;
134+
size_t m_data_size;
160135
};
136+
137+
using binary_collection = base_binary_collection<>;
138+
using writable_binary_collection = base_binary_collection<mio::mmap_sink>;
161139
}

0 commit comments

Comments
 (0)