Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce Checksumming #299

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 136 additions & 20 deletions heed/src/envs/env_open_options.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#[cfg(master3)]
use std::any::TypeId;
use std::ffi::CString;
#[cfg(windows)]
use std::ffi::OsStr;
Expand All @@ -14,10 +16,12 @@ use std::{io, ptr};
use aead::{generic_array::typenum::Unsigned, AeadCore, AeadMutInPlace, Key, KeyInit};
use synchronoise::SignalEvent;

#[cfg(master3)]
use super::checksum_func_wrapper;
#[cfg(master3)]
use super::encrypted_env::{encrypt_func_wrapper, EncryptedEnv};
use super::env::Env;
use super::{canonicalize_path, OPENED_ENV};
use super::{canonicalize_path, Checksum, NoChecksum, OPENED_ENV};
#[cfg(windows)]
use crate::envs::OsStrExtLmdb as _;
use crate::mdb::error::mdb_result;
Expand All @@ -28,28 +32,28 @@ use crate::{EnvFlags, Error, Result};
/// Options and flags which can be used to configure how an environment is opened.
#[derive(Debug, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct EnvOpenOptions<T: TlsUsage> {
pub struct EnvOpenOptions<T: TlsUsage, C: Checksum> {
map_size: Option<usize>,
max_readers: Option<u32>,
max_dbs: Option<u32>,
flags: EnvFlags,
_tls_marker: PhantomData<T>,
_marker: PhantomData<(T, C)>,
}

impl EnvOpenOptions<WithTls> {
impl EnvOpenOptions<WithTls, NoChecksum> {
/// Creates a blank new set of options ready for configuration.
pub fn new() -> EnvOpenOptions<WithTls> {
pub fn new() -> EnvOpenOptions<WithTls, NoChecksum> {
EnvOpenOptions {
map_size: None,
max_readers: None,
max_dbs: None,
flags: EnvFlags::empty(),
_tls_marker: PhantomData,
_marker: PhantomData,
}
}
}

impl<T: TlsUsage> EnvOpenOptions<T> {
impl<T: TlsUsage, C: Checksum + 'static> EnvOpenOptions<T, C> {
/// Make the read transactions `!Send` by specifying they will
/// use Thread Local Storage (TLS). It is often faster to open
/// TLS-backed transactions.
Expand Down Expand Up @@ -81,9 +85,9 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
/// is_sendable(rtxn);
/// # Ok(()) }
/// ```
pub fn read_txn_with_tls(self) -> EnvOpenOptions<WithTls> {
let Self { map_size, max_readers, max_dbs, flags, _tls_marker: _ } = self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _tls_marker: PhantomData }
pub fn read_txn_with_tls(self) -> EnvOpenOptions<WithTls, C> {
let Self { map_size, max_readers, max_dbs, flags, _marker: _ } = self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _marker: PhantomData }
}

/// Make the read transactions `Send` by specifying they will
Expand Down Expand Up @@ -126,9 +130,111 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
/// is_sendable(rtxn);
/// # Ok(()) }
/// ```
pub fn read_txn_without_tls(self) -> EnvOpenOptions<WithoutTls> {
let Self { map_size, max_readers, max_dbs, flags, _tls_marker: _ } = self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _tls_marker: PhantomData }
pub fn read_txn_without_tls(self) -> EnvOpenOptions<WithoutTls, C> {
let Self { map_size, max_readers, max_dbs, flags, _marker: _ } = self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _marker: PhantomData }
}

#[cfg(master3)]
/// Changes the checksum algorithm to use.
///
/// # Basic Example
///
/// Creates and open a database. The [`Env`] is using a [`crc`](https://github.com/mrhooray/crc-rs)
/// algorithm.
///
/// Note that you cannot use **any** type of crc algorithm as it is possible to tell
/// the size of the crc to LMDB.
///
/// ```
/// use std::fs;
/// use std::path::Path;
/// use memchr::memmem::find;
/// use argon2::Argon2;
/// use chacha20poly1305::{ChaCha20Poly1305, Key};
/// use heed3::types::*;
/// use heed3::{EnvOpenOptions, Checksum, Database, Error, MdbError};
///
/// /// A checksum algorithm based on the well-known CRC_32_BZIP2.
/// enum Crc32Bzip2 {}
///
/// impl Checksum for Crc32Bzip2 {
/// // Be careful the size is in bytes not bits.
/// const SIZE: u32 = 32 / 8;
///
/// fn checksum(input: &[u8], output: &mut [u8], _key: Option<&[u8]>) {
/// let sum = crc::Crc::<u32>::new(&crc::CRC_32_BZIP2).checksum(input);
/// eprintln!("checksumming {input:?} which gives {sum:?}");
/// output.copy_from_slice(&sum.to_ne_bytes());
/// }
/// }
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let env_path = tempfile::tempdir()?;
/// let password = "This is the password that will be hashed by the argon2 algorithm";
/// let salt = "The salt added to the password hashes to add more security when stored";
///
/// fs::create_dir_all(&env_path)?;
///
/// let mut key = Key::default();
/// Argon2::default().hash_password_into(password.as_bytes(), salt.as_bytes(), &mut key)?;
///
/// // We open the environment
/// let mut options = EnvOpenOptions::new().checksum::<Crc32Bzip2>();
/// let env = unsafe {
/// options
/// .map_size(10 * 1024 * 1024) // 10MB
/// .max_dbs(3)
/// .open_encrypted::<ChaCha20Poly1305, _>(key, &env_path)?
/// };
///
/// let key1 = "first-key";
/// let val1 = "this is my first value";
/// let key2 = "second-key";
/// let val2 = "this is a second information";
///
/// // We create a database and write values in it
/// let mut wtxn = env.write_txn()?;
/// let db = env.create_database::<Str, Str>(&mut wtxn, Some("first"))?;
/// db.put(&mut wtxn, key1, val1)?;
/// db.put(&mut wtxn, key2, val2)?;
/// wtxn.commit()?;
///
/// // We check that we can read the values back
/// let mut rtxn = env.read_txn()?;
/// assert_eq!(db.get(&mut rtxn, key1)?, Some(val1));
/// assert_eq!(db.get(&mut rtxn, key2)?, Some(val2));
/// drop(rtxn);
///
/// // We close the env and check that we can read in it
/// env.prepare_for_closing().wait();
///
/// // We modify the content of the data file
/// let mut content = fs::read(env_path.path().join("data.mdb"))?;
/// let pos = find(&content, b"value").unwrap();
/// content[pos..pos + 5].copy_from_slice(b"thing");
/// fs::write(env_path.path().join("data.mdb"), content)?;
///
/// // We reopen the environment
/// let mut options = EnvOpenOptions::new().checksum::<Crc32Bzip2>();
/// let env = unsafe {
/// options
/// .map_size(10 * 1024 * 1024) // 10MB
/// .max_dbs(3)
/// .open_encrypted::<ChaCha20Poly1305, _>(key, &env_path)?
/// };
///
/// // We check that we can read the values back
/// let mut rtxn = env.read_txn()?;
/// let db = env.open_database::<Str, Str>(&rtxn, Some("first"))?.unwrap();
/// assert!(matches!(db.get(&mut rtxn, key1).unwrap_err(), Error::Mdb(MdbError::BadChecksum)));
/// drop(rtxn);
///
/// # Ok(()) }
/// ```
pub fn checksum<NC: Checksum>(self) -> EnvOpenOptions<T, NC> {
let Self { map_size, max_readers, max_dbs, flags, _marker } = self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _marker: PhantomData }
}

/// Set the size of the memory map to use for this environment.
Expand Down Expand Up @@ -233,7 +339,7 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
/// [^7]: <https://github.com/LMDB/lmdb/blob/b8e54b4c31378932b69f1298972de54a565185b1/libraries/liblmdb/lmdb.h#L102-L105>
/// [^8]: <http://www.lmdb.tech/doc/index.html>
pub unsafe fn open<P: AsRef<Path>>(&self, path: P) -> Result<Env<T>> {
self.raw_open_with_encryption(
self.raw_open_with_checksum_and_encryption(
path.as_ref(),
#[cfg(master3)]
None,
Expand Down Expand Up @@ -390,14 +496,14 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
E: AeadMutInPlace + KeyInit,
P: AsRef<Path>,
{
self.raw_open_with_encryption(
self.raw_open_with_checksum_and_encryption(
path.as_ref(),
Some((Some(encrypt_func_wrapper::<E>), &key, <E as AeadCore>::TagSize::U32)),
)
.map(|inner| EncryptedEnv { inner })
}

fn raw_open_with_encryption(
fn raw_open_with_checksum_and_encryption(
&self,
path: &Path,
#[cfg(master3)] enc: Option<(ffi::MDB_enc_func, &[u8], u32)>,
Expand Down Expand Up @@ -437,6 +543,16 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
))?;
}

#[cfg(master3)]
if TypeId::of::<C>() != TypeId::of::<NoChecksum>() {
eprintln!("Doing some checksumming stuff");
mdb_result(ffi::mdb_env_set_checksum(
env,
Some(checksum_func_wrapper::<C>),
C::SIZE,
))?;
}

if let Some(size) = self.map_size {
if size % page_size::get() != 0 {
let msg = format!(
Expand Down Expand Up @@ -482,15 +598,15 @@ impl<T: TlsUsage> EnvOpenOptions<T> {
}
}

impl Default for EnvOpenOptions<WithTls> {
impl Default for EnvOpenOptions<WithTls, NoChecksum> {
fn default() -> Self {
Self::new()
}
}

impl<T: TlsUsage> Clone for EnvOpenOptions<T> {
impl<T: TlsUsage, C: Checksum> Clone for EnvOpenOptions<T, C> {
fn clone(&self) -> Self {
let Self { map_size, max_readers, max_dbs, flags, _tls_marker } = *self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _tls_marker }
let Self { map_size, max_readers, max_dbs, flags, _marker } = *self;
EnvOpenOptions { map_size, max_readers, max_dbs, flags, _marker }
}
}
48 changes: 48 additions & 0 deletions heed/src/envs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,51 @@ impl FlagSetMode {
}
}
}

/// A trait defining how to calculate checksum within the environment.
///
/// Enabling checksumming is not supported in the heed crate and
/// can only be modified within the heed3 crate.
pub trait Checksum {
/// The size of computed checksum values, in bytes.
const SIZE: u32;

/// Compute the checksum of the data in input and store the
/// result in output, an optional key may be used with keyed
/// hash algorithms.
///
/// The key parameter is an encryption key, if encryption was
/// configured. This parameter will be NULL if there is no key.
fn checksum(input: &[u8], output: &mut [u8], key: Option<&[u8]>);
}

/// Deactivate environment checksumming.
///
/// Enabling checksumming is not supported in the heed crate and
/// can only be modified within the heed3 crate.
pub enum NoChecksum {}

impl Checksum for NoChecksum {
const SIZE: u32 = 0;
fn checksum(_input: &[u8], _output: &mut [u8], _key: Option<&[u8]>) {}
}

/// The wrapper function that is called by LMDB that directly calls
/// the Rust idiomatic function internally.
#[cfg(master3)]
unsafe extern "C" fn checksum_func_wrapper<C: Checksum>(
src: *const ffi::MDB_val,
dst: *mut ffi::MDB_val,
key_ptr: *const ffi::MDB_val,
) {
let result = std::panic::catch_unwind(|| {
let input = ffi::from_val(*src);
let output = ffi::from_val_mut(*dst);
let key = if key_ptr.is_null() { None } else { Some(ffi::from_val(*key_ptr)) };
C::checksum(input, output, key)
});

if result.is_err() {
std::process::abort();
}
}
4 changes: 2 additions & 2 deletions heed/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ pub use self::databases::{EncryptedDatabase, EncryptedDatabaseOpenOptions};
#[cfg(master3)]
pub use self::envs::EncryptedEnv;
pub use self::envs::{
env_closing_event, CompactionOption, DefaultComparator, Env, EnvClosingEvent, EnvInfo,
EnvOpenOptions, FlagSetMode, IntegerComparator,
env_closing_event, Checksum, CompactionOption, DefaultComparator, Env, EnvClosingEvent,
EnvInfo, EnvOpenOptions, FlagSetMode, IntegerComparator, NoChecksum,
};
pub use self::iterator::{
RoIter, RoPrefix, RoRange, RoRevIter, RoRevPrefix, RoRevRange, RwIter, RwPrefix, RwRange,
Expand Down
6 changes: 5 additions & 1 deletion heed/src/mdb/lmdb_ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub use ffi::{
MDB_RDONLY, MDB_RESERVE,
};
#[cfg(master3)]
pub use ffi::{mdb_env_set_encrypt, MDB_enc_func};
pub use ffi::{mdb_env_set_checksum, mdb_env_set_encrypt, MDB_enc_func};
#[cfg(master3)]
use lmdb_master3_sys as ffi;
#[cfg(not(master3))]
Expand Down Expand Up @@ -46,3 +46,7 @@ pub unsafe fn into_val(value: &[u8]) -> ffi::MDB_val {
pub unsafe fn from_val<'a>(value: ffi::MDB_val) -> &'a [u8] {
std::slice::from_raw_parts(value.mv_data as *const u8, value.mv_size)
}

pub unsafe fn from_val_mut<'a>(value: ffi::MDB_val) -> &'a mut [u8] {
std::slice::from_raw_parts_mut(value.mv_data as *mut u8, value.mv_size)
}
2 changes: 2 additions & 0 deletions heed3/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ synchronoise = "1.0.1"
[dev-dependencies]
# TODO update dependencies
argon2 = { version = "0.5.3", features = ["std"] }
crc = "3.2.1"
memchr = "2.7.4"
serde = { version = "1.0.215", features = ["derive"] }
chacha20poly1305 = "0.10.1"
tempfile = "3.14.0"
Expand Down
Loading