From cd662f3d5a653ed48a66de185a87f83905d3f25a Mon Sep 17 00:00:00 2001
From: Juniper Tyree <50025784+juntyr@users.noreply.github.com>
Date: Thu, 19 Sep 2024 08:29:56 +0000
Subject: [PATCH] Start implementing the PCA codec

---
 Cargo.toml            |   5 +
 codecs/pca/Cargo.toml |  31 ++++
 codecs/pca/LICENSE    |   1 +
 codecs/pca/README.md  |  32 ++++
 codecs/pca/src/lib.rs | 358 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 427 insertions(+)
 create mode 100644 codecs/pca/Cargo.toml
 create mode 120000 codecs/pca/LICENSE
 create mode 100644 codecs/pca/README.md
 create mode 100644 codecs/pca/src/lib.rs

diff --git a/Cargo.toml b/Cargo.toml
index 63cff1d6..90b414ce 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ members = [
     "codecs/identity",
     "codecs/linear-quantize",
     "codecs/log",
+    "codecs/pca",
     "codecs/random-projection",
     "codecs/reinterpret",
     "codecs/round",
@@ -40,6 +41,8 @@ numcodecs-fixed-offset-scale = { version = "0.1", path = "codecs/fixed-offset-sc
 numcodecs-identity = { version = "0.1", path = "codecs/identity", default-features = false }
 numcodecs-linear-quantize = { version = "0.1", path = "codecs/linear-quantize", default-features = false }
 numcodecs-log = { version = "0.2", path = "codecs/log", default-features = false }
+numcodecs-pca = { version = "0.1", path = "codecs/pca", default-features = false }
+numcodecs-random-projection = { version = "0.1", path = "codecs/random-projection", default-features = false }
 numcodecs-reinterpret = { version = "0.1", path = "codecs/reinterpret", default-features = false }
 numcodecs-round = { version = "0.1", path = "codecs/round", default-features = false }
 numcodecs-swizzle-reshape = { version = "0.1", path = "codecs/swizzle-reshape", default-features = false }
@@ -53,8 +56,10 @@ numcodecs-zstd = { version = "0.1", path = "codecs/zstd", default-features = fal
 convert_case = { version = "0.6", default-features = false }
 format_serde_error = { version = "0.3", default-features = false }
 miniz_oxide = { version = "0.8", default-features = false }
+nalgebra = { version = "0.30", default-features = false } # keep in sync with nshare
 ndarray = { version = "0.15", default-features = false } # keep in sync with numpy
 ndarray-rand = { version = "0.14", default-features = false }
+nshare = { version = "0.9", default-features = false } # keep in sync with ndarray
 numpy = { version = "0.21", default-features = false }
 num-traits = { version = "0.2", default-features = false }
 postcard = { version = "1.0", default-features = false }
diff --git a/codecs/pca/Cargo.toml b/codecs/pca/Cargo.toml
new file mode 100644
index 00000000..4b7a244e
--- /dev/null
+++ b/codecs/pca/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "numcodecs-pca"
+version = "0.1.0"
+edition = { workspace = true }
+authors = { workspace = true }
+repository = { workspace = true }
+license = { workspace = true }
+rust-version = { workspace = true }
+
+description = "PCA codec implementation for the numcodecs API"
+readme = "README.md"
+categories = ["compression", "encoding"]
+keywords = ["pca", "numcodecs", "compression", "encoding"]
+
+[dependencies]
+nalgebra = { workspace = true }
+ndarray = { workspace = true, features = ["std"] }
+ndarray-rand = { workspace = true }
+nshare = { workspace = true, features = ["ndarray", "nalgebra", "nalgebra_std"] }
+numcodecs = { workspace = true }
+num-traits = { workspace = true, features = ["std"] }
+schemars = { workspace = true, features = ["derive", "preserve_order"] }
+serde = { workspace = true, features = ["std", "derive"] }
+thiserror = { workspace = true }
+wyhash = { workspace = true }
+
+[dev-dependencies]
+ndarray-rand = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/codecs/pca/LICENSE b/codecs/pca/LICENSE
new file mode 120000
index 00000000..30cff740
--- /dev/null
+++ b/codecs/pca/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/codecs/pca/README.md b/codecs/pca/README.md
new file mode 100644
index 00000000..806642e1
--- /dev/null
+++ b/codecs/pca/README.md
@@ -0,0 +1,32 @@
+[![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
+
+[CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
+[workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
+
+[MSRV]: https://img.shields.io/badge/MSRV-1.76.0-blue
+[repo]: https://github.com/juntyr/numcodecs-rs
+
+[Latest Version]: https://img.shields.io/crates/v/numcodecs-pca
+[crates.io]: https://crates.io/crates/numcodecs-pca
+
+[Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-pca
+[docs.rs]: https://docs.rs/numcodecs-pca/
+
+[Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
+[docs]: https://juntyr.github.io/numcodecs-rs/numcodecs_pca
+
+# numcodecs-pca
+
+PCA codec implementation for the [`numcodecs`] API.
+
+[`numcodecs`]: https://docs.rs/numcodecs/0.1/numcodecs/
+
+## License
+
+Licensed under the Mozilla Public License, Version 2.0 ([LICENSE](LICENSE) or https://www.mozilla.org/en-US/MPL/2.0/).
+
+## Funding
+
+The `numcodecs-pca` crate has been developed as part of [ESiWACE3](https://www.esiwace.eu), the third phase of the Centre of Excellence in Simulation of Weather and Climate in Europe.
+
+Funded by the European Union. This work has received funding from the European High Performance Computing Joint Undertaking (JU) under grant agreement No 101093054.
diff --git a/codecs/pca/src/lib.rs b/codecs/pca/src/lib.rs
new file mode 100644
index 00000000..1822288b
--- /dev/null
+++ b/codecs/pca/src/lib.rs
@@ -0,0 +1,358 @@
+//! [![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
+//!
+//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
+//! [workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
+//!
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.76.0-blue
+//! [repo]: https://github.com/juntyr/numcodecs-rs
+//!
+//! [Latest Version]: https://img.shields.io/crates/v/numcodecs-pca
+//! [crates.io]: https://crates.io/crates/numcodecs-pca
+//!
+//! [Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-pca
+//! [docs.rs]: https://docs.rs/numcodecs-pca/
+//!
+//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
+//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs_pca
+//!
+//! PCA codec implementation for the [`numcodecs`] API.
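+//!
+//! The sketch below illustrates the intended usage through the `numcodecs`
+//! [`Codec`] API. It is an editorial example and is marked `ignore`, since
+//! encoding and decoding are still work in progress in this first draft; the
+//! array shape and the `k` and `seed` values are made up, and wrapping the
+//! input as an [`AnyCowArray`] is assumed to work as shown.
+//!
+//! ```ignore
+//! use std::num::NonZeroUsize;
+//!
+//! use ndarray::Array2;
+//! use numcodecs::{AnyCowArray, Codec};
+//! use numcodecs_pca::{NonNegative, PCACodec};
+//!
+//! // keep the first two principal components of a 100 x 10 dataset
+//! let codec = PCACodec {
+//!     k: NonZeroUsize::new(2).unwrap(),
+//!     seed: Some(42),                     // use randomized PCA
+//!     svd_tolerance: NonNegative::zero(), // default convergence tolerance
+//!     svd_max_iterations: None,           // iterate until convergence
+//! };
+//!
+//! let data = Array2::<f64>::zeros((100, 10));
+//! // the encoded array has shape 100 x 2, i.e. `N x K`
+//! let encoded = codec.encode(AnyCowArray::F64(data.into_dyn().into())).unwrap();
+//! ```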
+
+use std::{
+    borrow::Cow,
+    num::NonZeroUsize,
+    ops::{MulAssign, SubAssign},
+};
+
+use nalgebra::{
+    linalg::{QR, SVD},
+    ComplexField, Scalar,
+};
+use ndarray::{Array, ArrayBase, Axis, Data, Dimension, Ix1, Ix2, ShapeError};
+use ndarray_rand::{
+    rand::SeedableRng,
+    rand_distr::{Distribution, StandardNormal},
+    RandomExt,
+};
+use nshare::{ToNalgebra, ToNdarray2};
+use num_traits::{Float, FromPrimitive, Zero};
+use numcodecs::{
+    AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
+    Codec, StaticCodec, StaticCodecConfig,
+};
+use schemars::{json_schema, JsonSchema, Schema, SchemaGenerator};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use thiserror::Error;
+use wyhash::WyRng;
+
+/// Codec that compresses high-dimensional data by reducing its dimensionality
+/// using principal component analysis (PCA).
+///
+/// A two-dimensional array of shape `N x D` is encoded as an array of shape
+/// `N x K`, where `K` is the number of principal components to keep.
+///
+/// For high-dimensional arrays, randomized PCA should be used by providing
+/// a `seed` for a random number generator.
+///
+/// This codec only supports finite floating point data.
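+///
+/// The codec is configured through its serialized parameters (the fields
+/// below). An illustrative configuration is sketched here; the values are
+/// arbitrary, only `k` is required, and the remaining fields fall back to
+/// their defaults when omitted:
+///
+/// ```json
+/// {
+///     "k": 16,
+///     "seed": 42,
+///     "svd_tolerance": 0.0,
+///     "svd_max_iterations": 100
+/// }
+/// ```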
+#[derive(Clone, Serialize, Deserialize, JsonSchema)]
+// FIXME: #[serde(deny_unknown_fields)]
+pub struct PCACodec {
+    /// The number of principal components `K` to keep, i.e. the
+    /// dimensionality of the encoded data
+    pub k: NonZeroUsize,
+    /// Optional seed for using randomized PCA instead of full PCA
+    #[serde(default)]
+    pub seed: Option<u64>,
+    /// Tolerance for the SVD solver when checking for convergence, 0.0 by default
+    #[serde(default = "NonNegative::zero")]
+    pub svd_tolerance: NonNegative<f64>,
+    /// Optional maximum number of iterations before the SVD solver fails.
+    ///
+    /// By default, the solver tries indefinitely.
+    #[serde(default)]
+    pub svd_max_iterations: Option<NonZeroUsize>,
+}
+
+impl Codec for PCACodec {
+    type Error = PCACodecError;
+
+    fn encode(&self, data: AnyCowArray) -> Result<AnyArray, Self::Error> {
+        match data {
+            AnyCowArray::F32(data) => Ok(AnyArray::F32(
+                project_with_projection(
+                    data,
+                    self.k,
+                    self.seed,
+                    self.svd_tolerance,
+                    self.svd_max_iterations,
+                )?
+                .into_dyn(),
+            )),
+            AnyCowArray::F64(data) => Ok(AnyArray::F64(
+                project_with_projection(
+                    data,
+                    self.k,
+                    self.seed,
+                    self.svd_tolerance,
+                    self.svd_max_iterations,
+                )?
+                .into_dyn(),
+            )),
+            encoded => Err(PCACodecError::UnsupportedDtype(encoded.dtype())),
+        }
+    }
+
+    fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
+        match encoded {
+            AnyCowArray::F32(encoded) => Ok(AnyArray::F32(unimplemented!())),
+            AnyCowArray::F64(encoded) => Ok(AnyArray::F64(unimplemented!())),
+            encoded => Err(PCACodecError::UnsupportedDtype(encoded.dtype())),
+        }
+    }
+
+    fn decode_into(
+        &self,
+        encoded: AnyArrayView,
+        decoded: AnyArrayViewMut,
+    ) -> Result<(), Self::Error> {
+        match (encoded, decoded) {
+            (AnyArrayView::F32(encoded), AnyArrayViewMut::F32(decoded)) => {
+                unimplemented!()
+            }
+            (AnyArrayView::F64(encoded), AnyArrayViewMut::F64(decoded)) => {
+                unimplemented!()
+            }
+            (encoded @ (AnyArrayView::F32(_) | AnyArrayView::F64(_)), decoded) => {
+                Err(PCACodecError::MismatchedDecodeIntoArray {
+                    source: AnyArrayAssignError::DTypeMismatch {
+                        src: encoded.dtype(),
+                        dst: decoded.dtype(),
+                    },
+                })
+            }
+            (encoded, _decoded) => Err(PCACodecError::UnsupportedDtype(encoded.dtype())),
+        }
+    }
+}
+
+impl StaticCodec for PCACodec {
+    const CODEC_ID: &'static str = "pca";
+
+    type Config<'de> = Self;
+
+    fn from_config(config: Self::Config<'_>) -> Self {
+        config
+    }
+
+    fn get_config(&self) -> StaticCodecConfig {
+        StaticCodecConfig::from(self)
+    }
+}
+
+#[derive(Debug, Error)]
+/// Errors that may occur when applying the [`PCACodec`].
+pub enum PCACodecError {
+    /// [`PCACodec`] does not support the dtype
+    #[error("PCA does not support the dtype {0}")]
+    UnsupportedDtype(AnyArrayDType),
+    /// [`PCACodec`] only supports matrix / 2d-shaped arrays
+    #[error("PCA only supports matrix / 2d-shaped arrays")]
+    NonMatrixData {
+        /// The source of the error
+        #[from]
+        source: ShapeError,
+    },
+    /// [`PCACodec`] does not support non-finite (infinite or NaN)
+    /// floating point data
+    #[error("PCA does not support non-finite (infinite or NaN) floating point data")]
+    NonFiniteData,
+    /// [`PCACodec`] cannot encode or decode from an array with `N`
+    /// samples to an array with a different number of samples
+    #[error("PCA cannot encode or decode from an array with {input} samples to an array with {output} samples")]
+    NumberOfSamplesMismatch {
+        /// Number of samples `N` in the input array
+        input: usize,
+        /// Number of samples `N` in the output array
+        output: usize,
+    },
+    /// [`PCACodec`] cannot decode from an array with zero
+    /// dimensionality `K`
+    #[error("PCA cannot decode from an array with zero dimensionality `K`")]
+    ProjectedArrayZeroComponents,
+    /// [`PCACodec`] cannot decode from an array with corrupted
+    /// dimensionality metadata
+    #[error("PCA cannot decode from an array with corrupted dimensionality metadata")]
+    CorruptedNumberOfComponents,
+    /// [`PCACodec`] cannot decode into an array with `D` features
+    /// that differs from the `D` stored in the encoded metadata
+    #[error("PCA cannot decode into an array with {output} features that differs from the {metadata} features stored in the encoded metadata")]
+    NumberOfFeaturesMismatch {
+        /// Number of features `D` in the encoded array metadata
+        metadata: usize,
+        /// Number of features `D` in the decoded output array
+        output: usize,
+    },
+    /// [`PCACodec`] cannot decode into the provided array
+    #[error("PCA cannot decode into the provided array")]
+    MismatchedDecodeIntoArray {
+        /// The source of the error
+        #[from]
+        source: AnyArrayAssignError,
+    },
+}
+
+/// Projects the two-dimensional `data` of shape `N x D` onto its first `k`
+/// principal components, producing an array of shape `N x K`.
+///
+/// # Errors
+///
+/// Errors with [`PCACodecError::NonMatrixData`] if `data` is not a
+/// two-dimensional matrix.
+pub fn project_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
+    data: ArrayBase<S, D>,
+    k: NonZeroUsize,
+    seed: Option<u64>,
+    svd_tolerance: NonNegative<f64>,
+    svd_max_iterations: Option<NonZeroUsize>,
+) -> Result<Array<T, Ix2>, PCACodecError>
+where
+    StandardNormal: Distribution<T>,
+{
+    let data: ArrayBase<S, Ix2> = data
+        .into_dimensionality()
+        .map_err(|err| PCACodecError::NonMatrixData { source: err })?;
+
+    let (n, d) = data.dim();
+
+    let Some(mean): Option<Array<T, Ix1>> = data.mean_axis(Axis(0)) else {
+        return Ok(Array::zeros((n, k.get())));
+    };
+
+    let mut centred_data = data.into_owned();
+    centred_data.sub_assign(&mean);
+
+    let centred_data_matrix = centred_data.view_mut().into_nalgebra();
+
+    // adapted from https://github.com/ekg/rsvd/blob/a44fd1584144f8f60c2a0b872edcb47b8b64d769/src/lib.rs#L39-L78
+    // published under the MIT License by Erik Garrison
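+    //
+    // In brief (an editorial summary of the randomized scheme used by the
+    // reference above): for the centred data matrix X of shape N x D, draw a
+    // Gaussian random test matrix Omega of shape D x l with l = k + P
+    // oversamples, form the sample matrix Y = X * Omega (N x l),
+    // orthogonalize it via QR as Y = Q * R, and compress the data to
+    // B = Q^T * X (l x D). Since X is approximately Q * B, the right singular
+    // vectors V^T of the small matrix B approximate the leading principal
+    // axes of X, so the first k rows of V^T span the reduced K-dimensional
+    // space.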
+    let (projection, projected) = if let Some(seed) = seed {
+        let mut rng = WyRng::seed_from_u64(seed);
+
+        // number of oversamples
+        const P: usize = 10;
+        let l = k.get() + P;
+
+        // generate the Gaussian random test matrix
+        let omega = Array::random_using((d, l), StandardNormal, &mut rng);
+        let omega = omega.view().into_nalgebra();
+
+        // form the sample matrix Y
+        let y = (&centred_data_matrix) * omega;
+
+        // orthogonalize Y
+        let (q, _r) = QR::new(y).unpack();
+
+        // project the input data to the lower dimension
+        let b = q.transpose() * (&centred_data_matrix);
+
+        // compute the SVD of the small matrix B
+        let Some(SVD { v_t: Some(v_t), .. }) = SVD::try_new(
+            b,
+            false,
+            true,
+            <T as FloatExt>::from_f64(svd_tolerance.0),
+            svd_max_iterations.map_or(0, NonZeroUsize::get),
+        ) else {
+            panic!("SVD failed");
+        };
+
+        // truncate V^T to the first k principal axes
+        let projection = v_t.rows(0, k.get()).into_owned();
+        // project the centred data onto the principal axes: (N x D) * (D x K)
+        let projected = centred_data_matrix * projection.transpose();
+
+        (projection.into_ndarray2(), projected.into_ndarray2())
+    } else {
+        let Some(SVD { v_t: Some(v_t), .. }) = SVD::try_new(
+            centred_data_matrix.clone_owned(),
+            false,
+            true,
+            <T as FloatExt>::from_f64(svd_tolerance.0),
+            svd_max_iterations.map_or(0, NonZeroUsize::get),
+        ) else {
+            panic!("full SVD failed");
+        };
+
+        // full SVD, needs to be truncated to the first k principal axes
+        let projection = v_t.slice((0, 0), (k.get(), d)).into_owned();
+        // project the centred data onto the principal axes: (N x D) * (D x K)
+        let projected = centred_data_matrix * projection.transpose();
+
+        (projection.into_ndarray2(), projected.into_ndarray2())
+    };
+
+    todo!();
+}
+
+/// Floating point types.
+pub trait FloatExt:
+    Float + FromPrimitive + SubAssign + Scalar + MulAssign + ComplexField<RealField = Self>
+{
+    /// Converts from a [`f64`].
+    #[must_use]
+    fn from_f64(x: f64) -> Self;
+}
+
+impl FloatExt for f32 {
+    #[allow(clippy::cast_possible_truncation)]
+    fn from_f64(x: f64) -> Self {
+        x as Self
+    }
+}
+
+impl FloatExt for f64 {
+    fn from_f64(x: f64) -> Self {
+        x
+    }
+}
+
+#[allow(clippy::derive_partial_eq_without_eq)] // floats are not Eq
+#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
+/// Non-negative floating point number
+pub struct NonNegative<T>(T);
+
+impl<T: Zero> NonNegative<T> {
+    /// Constructs the zero value
+    #[must_use]
+    pub fn zero() -> Self {
+        Self(T::zero())
+    }
+}
+
+impl Serialize for NonNegative<f64> {
+    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_f64(self.0)
+    }
+}
+
+impl<'de> Deserialize<'de> for NonNegative<f64> {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        let x = f64::deserialize(deserializer)?;
+
+        if x >= 0.0 {
+            Ok(Self(x))
+        } else {
+            Err(serde::de::Error::invalid_value(
+                serde::de::Unexpected::Float(x),
+                &"a non-negative value",
+            ))
+        }
+    }
+}
+
+impl JsonSchema for NonNegative<f64> {
+    fn schema_name() -> Cow<'static, str> {
+        Cow::Borrowed("NonNegativeF64")
+    }
+
+    fn schema_id() -> Cow<'static, str> {
+        Cow::Borrowed(concat!(module_path!(), "::", "NonNegative"))
+    }
+
+    fn json_schema(_gen: &mut SchemaGenerator) -> Schema {
+        json_schema!({
+            "type": "number",
+            "minimum": 0.0
+        })
+    }
+}
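+
+// NOTE (editorial addition, not part of the original patch): a minimal unit
+// test sketch for the `NonNegative` validation above; it only relies on
+// serde's built-in value deserializer, so no extra dev-dependency is needed.
+#[cfg(test)]
+mod tests {
+    use serde::{de::IntoDeserializer, Deserialize};
+
+    use super::NonNegative;
+
+    #[test]
+    fn non_negative_validates_the_sign() {
+        type De = serde::de::value::F64Deserializer<serde::de::value::Error>;
+
+        // zero and positive values are accepted ...
+        let ok: De = 1.0_f64.into_deserializer();
+        assert!(NonNegative::<f64>::deserialize(ok).is_ok());
+
+        // ... while negative values are rejected with an `invalid_value` error
+        let err: De = (-1.0_f64).into_deserializer();
+        assert!(NonNegative::<f64>::deserialize(err).is_err());
+    }
+}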