Skip to content

Commit 7fbce6d

Browse files
committed
better reading for bitpack
1 parent 1d4657f commit 7fbce6d

File tree

5 files changed

+29
-30
lines changed

5 files changed

+29
-30
lines changed

cp-algo/structures/bit_array.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ namespace cp_algo::structures {
1414
// Sets bit x: ORs a single one, shifted to offset x % width, into word x / width of data.
void set(size_t x) {
1515
data[x / width] |= 1ULL << (x % width);
1616
}
17+
// Clears bit x: ANDs word x / width with the complement of the one-bit mask at offset x % width.
void reset(size_t x) {
18+
data[x / width] &= ~(1ULL << (x % width));
19+
}
1720
// Toggles bit x: XORs the one-bit mask at offset x % width into word x / width of data.
void flip(size_t x) {
1821
data[x / width] ^= 1ULL << (x % width);
1922
}

cp-algo/structures/bitpack.hpp

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,24 @@
11
#ifndef CP_ALGO_STRUCTURES_BITPACK_HPP
22
#define CP_ALGO_STRUCTURES_BITPACK_HPP
33
#include "../structures/bit_array.hpp"
4+
#include "../util/simd.hpp"
45
#include <cstdint>
56
#include <cstddef>
67
#include <string>
78
#include <array>
89
namespace cp_algo::structures {
9-
template<size_t n, typename Int = uint64_t>
10-
struct bitpack: bit_array<n, Int> {
11-
using Base = bit_array<n, Int>;
10+
template<size_t n>
11+
struct bitpack: bit_array<n, uint64_t> {
12+
using Base = bit_array<n, uint64_t>;
1213
using Base::width, Base::blocks, Base::data;
14+
using Base::set, Base::reset;
1315
// Defaulted three-way comparison: the compiler generates it from the base class and members.
auto operator <=> (bitpack const& t) const = default;
1416

1517
// Default constructor with an empty body; the initial contents of data are whatever the
// bit_array base provides (its definition is not visible here).
bitpack() {}
16-
// Builds the bit words from a string of '0'/'1' characters. This span shows both sides of the
// change: gutter entries ending in '-' belong to the old body, those ending in '+' to the new one;
// the two closing braces at the end are shared context.
// Old body: pads its own copy of the string with '0' up to a multiple of width, then folds each
// width-character chunk into data[i] from the chunk's last character to its first, so the first
// character of a chunk lands in the lowest bit. It stops when the string is exhausted.
bitpack(std::string bits) {
17-
size_t rem = size(bits) % width;
18-
if(rem) {
19-
bits += std::string(width - rem, '0');
20-
}
21-
for(size_t i = 0, pos = 0; pos < size(bits); i++, pos += width) {
22-
for(size_t j = width; j; j--) {
23-
data[i] *= 2;
24-
data[i] ^= bits[pos + j - 1] == '1';
25-
}
18+
// New body: takes the caller's string by non-const reference, resizes it in place so its length
// is a multiple of width, then assigns every one of the `blocks` words from read_bits64 applied
// to consecutive width-character chunks.
// NOTE(review): the resize only rounds the string's own length up, while the loop always reads
// blocks * width characters. If the string is shorter than that, read_bits64 appears to read
// past the end of the buffer — confirm callers always pass full-length strings, or resize to
// blocks * width instead.
// NOTE(review): a non-const lvalue reference cannot bind to temporaries, and the caller's string
// is modified by the resize — confirm both are intended.
bitpack(std::string &bits) {
19+
bits.resize((size(bits) + width - 1) / width * width);
20+
for(size_t i = 0; i < blocks; i++) {
21+
data[i] = read_bits64(bits.data() + i * width);
2622
}
2723
}
2824

@@ -42,7 +38,7 @@ namespace cp_algo::structures {
4238
std::string to_string() const {
4339
std::string res(blocks * width, '0');
4440
for(size_t i = 0, pos = 0; i < blocks; i++, pos += width) {
45-
Int block = data[i];
41+
auto block = data[i];
4642
for(size_t j = 0; j < width; j++) {
4743
res[pos + j] = '0' + block % 2;
4844
block /= 2;

cp-algo/util/bit.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#ifndef CP_ALGO_UTIL_BIT_HPP
22
#define CP_ALGO_UTIL_BIT_HPP
3-
#include <immintrin.h>
3+
#include "../util/simd.hpp"
44
#include <cstdint>
55
#include <array>
66
#include <bit>
@@ -25,5 +25,12 @@ namespace cp_algo {
2525
callback.template operator()<1ULL << fl>();
2626
}
2727
}
28+
29+
// Packs 32 characters starting at p into a 32-bit mask. Adding 127 - '0' to every byte maps '0'
// to 127 (top bit clear) and '1' to 128 (top bit set); _mm256_movemask_epi8 then gathers the top
// bit of each byte, with byte k supplying bit k of the result.
// NOTE(review): vector_cast is not defined in the visible code — presumably it reinterprets the
// memory at p as a u8x32; confirm, including whether unaligned addresses are supported.
[[gnu::target("avx2"), gnu::always_inline]] inline uint32_t read_bits(char const* p) {
30+
return _mm256_movemask_epi8(__m256i(vector_cast<u8x32 const>(p[0]) + (127 - '0')));
31+
}
32+
// Packs 64 characters starting at p into one 64-bit word: read_bits on the first 32 characters
// fills the low half, read_bits on the next 32 fills the high half.
// NOTE(review): this function has no target attribute but force-inlines the avx2-targeted
// read_bits; GCC can reject always_inline across mismatched targets unless the translation unit
// is built with AVX2 enabled — verify the build flags or add the attribute here.
[[gnu::always_inline]] inline uint64_t read_bits64(char const* p) {
33+
return read_bits(p) | (uint64_t(read_bits(p + 32)) << 32);
34+
}
2835
}
2936
#endif // CP_ALGO_UTIL_BIT_HPP

cp-algo/util/simd.hpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@
77
namespace cp_algo {
88
// simd<T, len> is T extended to a GNU vector type of len * sizeof(T) bytes; the aliases below
// name the element type and lane count used in this header (e.g. u8x32 = 32 lanes of uint8_t).
// In this diff the u32x8 alias is moved further down the list and u8x32 is newly added.
template<typename T, size_t len>
99
using simd [[gnu::vector_size(len * sizeof(T))]] = T;
10-
using u32x8 = simd<uint32_t, 8>;
1110
using i64x4 = simd<int64_t, 4>;
1211
using u64x4 = simd<uint64_t, 4>;
12+
using u32x8 = simd<uint32_t, 8>;
1313
using i32x4 = simd<int32_t, 4>;
1414
using u32x4 = simd<uint32_t, 4>;
1515
using i16x4 = simd<int16_t, 4>;
16+
using u8x32 = simd<uint8_t, 32>;
1617
using dx4 = simd<double, 4>;
1718

1819
[[gnu::always_inline]] inline dx4 abs(dx4 a) {
@@ -44,23 +45,14 @@ namespace cp_algo {
4445
// Views x as eight 32-bit lanes and exchanges each adjacent pair (lane 0 with 1, 2 with 3, ...),
// i.e. swaps the two 32-bit halves of every 64-bit element, then casts back to x's own type.
// Despite the name, the shuffle indices move whole 32-bit lanes, not individual bytes.
[[gnu::always_inline]] inline auto swap_bytes(auto x) {
4546
return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 1, 0, 3, 2, 5, 4, 7, 6));
4647
}
47-
// Montgomery reduction step on four 64-bit lanes. This span shows both sides of the change:
// gutter entries ending in '-' are the old lines, '+' the new one, plain numbers shared context.
// Shared body: x_ninv = low32(x) * imod per lane; x += low32(x_ninv) * mod; the 32-bit halves of
// each lane are then exchanged by swap_bytes — presumably to bring the reduced value into the
// low half; confirm against callers.
// The change drops the #ifdef __AVX2__ / #else fallback built on low32 and instead marks the
// function with target("avx2"), so only the intrinsic path remains.
[[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, uint32_t mod, uint32_t imod) {
48-
#ifdef __AVX2__
48+
[[gnu::target("avx2"), gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, uint32_t mod, uint32_t imod) {
4949
auto x_ninv = u64x4(_mm256_mul_epu32(__m256i(x), __m256i() + imod));
5050
x += u64x4(_mm256_mul_epu32(__m256i(x_ninv), __m256i() + mod));
51-
#else
52-
auto x_ninv = x * imod;
53-
x += low32(x_ninv) * mod;
54-
#endif
5551
return swap_bytes(x);
5652
}
5753

58-
// Lane-wise Montgomery multiplication: multiplies the low 32 bits of each 64-bit lane of x and y
// into a 64-bit product and passes it to montgomery_reduce. Gutter entries ending in '-' are the
// old lines and '+' the new one.
// The change removes the non-AVX2 fallback (low32(x) * low32(y)) and marks the function with
// target("avx2"), leaving only the _mm256_mul_epu32 path.
[[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
59-
#ifdef __AVX2__
54+
[[gnu::target("avx2"), gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
6055
return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
61-
#else
62-
return montgomery_reduce(low32(x) * low32(y), mod, imod);
63-
#endif
6456
}
6557
[[gnu::always_inline]] inline u32x8 montgomery_mul(u32x8 x, u32x8 y, uint32_t mod, uint32_t imod) {
6658
return u32x8(montgomery_mul(u64x4(x), u64x4(y), mod, imod)) |

verify/structures/bitpack/prod_mod_2.test.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ void solve() {
4040
b[i] = row;
4141
}
4242
cp_algo::checkpoint("read");
43-
for(int j = 0; j < m; j += 64) {
44-
for(int z = 0; z < 64 / K; z++) {
43+
const int width = bitpack<maxn>::width;
44+
for(int j = 0; j < m; j += width) {
45+
for(int z = 0; z < width / K; z++) {
4546
process_precalc(j / K + z);
4647
for(int i = 0; i < n; i++) {
47-
c[i] ^= precalc[uint8_t(a[i].word(j / 64) >> K * z)];
48+
c[i] ^= precalc[uint8_t(a[i].word(j / width) >> K * z)];
4849
}
4950
}
5051
}

0 commit comments

Comments
 (0)