Feat: implement Random Projections #332

Merged · 8 commits · Mar 30, 2024 · diff shown from 6 commits
9 changes: 8 additions & 1 deletion algorithms/linfa-reduction/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "linfa-reduction"
version = "0.7.0"
authors = ["Lorenz Schmidt <[email protected]>"]
authors = [
"Lorenz Schmidt <[email protected]>",
"Gabriel Bathie <[email protected]>",
]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }

linfa = { version = "0.7.0", path = "../.." }
linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
sprs = "0.11.1"
rand_xoshiro = "0.6.0"

[dev-dependencies]
ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
"generate",
] }
approx = { version = "0.4" }
mnist = { version = "0.6.0", features = ["download"] }
linfa-trees = { version = "0.7.0", path = "../linfa-trees" }
4 changes: 4 additions & 0 deletions algorithms/linfa-reduction/README.md
@@ -11,6 +11,8 @@
`linfa-reduction` currently provides an implementation of the following dimensionality reduction methods:
- Diffusion Mapping
- Principal Component Analysis (PCA)
- Gaussian random projections
- Sparse random projections

## Examples

@@ -19,6 +21,8 @@ There is a usage example in the `examples/` directory. To run, use:
```bash
$ cargo run --release --example diffusion_map
$ cargo run --release --example pca
$ cargo run --release --example gaussian_projection
$ cargo run --release --example sparse_projection
```
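A minimal sketch of calling the new API directly (hedged: the builder calls mirror those in this PR's examples, while `Dataset::from` on a bare `ndarray` array follows linfa's usual conventions):

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical data: 50 samples with 1_000 features each.
    let data = Array2::<f32>::ones((50, 1_000));
    let dataset = Dataset::from(data);

    // Fit a projection down to 10 dimensions, then apply it.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(10)
        .fit(&dataset)?;
    let reduced = proj.transform(&dataset);
    assert_eq!(reduced.nfeatures(), 10);
    Ok(())
}
```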

## BLAS/LAPACK backend
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model precision: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model precision: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
8 changes: 8 additions & 0 deletions algorithms/linfa-reduction/src/error.rs
@@ -18,4 +18,12 @@ pub enum ReductionError {
LinalgError(#[from] linfa_linalg::LinalgError),
#[error(transparent)]
LinfaError(#[from] linfa::error::Error),
#[error(transparent)]
NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
#[error("Precision parameter must be in the interval (0; 1)")]
InvalidPrecision,
#[error("Target dimension of the projection must be positive")]
NonPositiveEmbeddingSize,
#[error("Target dimension {0} is larger than the number of features {1}.")]
DimensionIncrease(usize, usize),
}
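As a quick illustration of the new `DimensionIncrease` variant (a hedged sketch on a hypothetical dataset; the fit-time check itself appears in the Gaussian projection's `fit` further down):

```rust
use linfa::prelude::*;
use linfa_reduction::{random_projection::GaussianRandomProjection, ReductionError};
use ndarray::Array2;

fn main() {
    // 100 samples with 20 features: requesting 50 output dimensions
    // must fail, since a projection should not increase dimensionality.
    let dataset = Dataset::from(Array2::<f32>::zeros((100, 20)));
    let res = GaussianRandomProjection::<f32>::params()
        .target_dim(50)
        .fit(&dataset);
    assert!(matches!(res, Err(ReductionError::DimensionIncrease(50, 20))));
}
```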
1 change: 1 addition & 0 deletions algorithms/linfa-reduction/src/lib.rs
@@ -6,6 +6,7 @@ extern crate ndarray;
mod diffusion_map;
mod error;
mod pca;
pub mod random_projection;
pub mod utils;

pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
38 changes: 38 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/common.rs
@@ -0,0 +1,38 @@
/// Compute a safe dimension for a projection with precision `eps`,
/// using the Johnson-Lindenstrauss Lemma.
///
/// References:
/// - [D. Achlioptas, JCSS](https://www.sciencedirect.com/science/article/pii/S0022000003000254)
/// - [Li et al., SIGKDD'06](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf)
pub(crate) fn johnson_lindenstrauss_min_dim(n_samples: usize, eps: f64) -> usize {
let log_samples = (n_samples as f64).ln();
let value = 4. * log_samples / (eps.powi(2) / 2. - eps.powi(3) / 3.);
value as usize
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
/// Test against values computed by the scikit-learn implementation
/// of `johnson_lindenstrauss_min_dim`.
fn test_johnson_lindenstrauss() {
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.05), 15244);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.1), 3947);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.2), 1062);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.5), 221);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.05), 22867);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.1), 5920);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.2), 1594);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.5), 331);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.05), 28194);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.1), 7300);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.2), 1965);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.5), 408);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.05), 30489);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.1), 7894);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.2), 2125);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.5), 442);
}
}
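For reference, the bound implemented above is the standard Johnson-Lindenstrauss estimate (the same one scikit-learn's `johnson_lindenstrauss_min_dim` uses), truncated to an integer:

```math
k \ge \frac{4 \ln n}{\varepsilon^2/2 - \varepsilon^3/3}
```

As a worked check against the test values: for `n = 100` and `eps = 0.5`, `4 ln 100 ≈ 18.421` and `0.5²/2 - 0.5³/3 ≈ 0.0833`, giving `18.421 / 0.0833 ≈ 221.0`, which truncates to the `221` asserted above.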
77 changes: 77 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/gaussian/algorithm.rs
@@ -0,0 +1,77 @@
use linfa::{prelude::Records, traits::Fit, Float};
use ndarray::{Array, Array2, Ix2};
use ndarray_rand::{
rand_distr::{Normal, StandardNormal},
RandomExt,
};
use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::super::common::johnson_lindenstrauss_min_dim;
use super::hyperparams::GaussianRandomProjectionParamsInner;
use super::{GaussianRandomProjectionParams, GaussianRandomProjectionValidParams};
use crate::{impl_proj, ReductionError};

/// Embedding via Gaussian random projection
pub struct GaussianRandomProjection<F: Float> {
projection: Array2<F>,
}

impl<F, Rec, T, R> Fit<Rec, T, ReductionError> for GaussianRandomProjectionValidParams<R>
where
F: Float,
Rec: Records<Elem = F>,
R: Rng + Clone,
StandardNormal: Distribution<F>,
{
type Object = GaussianRandomProjection<F>;

fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
let n_samples = dataset.nsamples();
let n_features = dataset.nfeatures();
let mut rng = self.rng.clone();

let n_dims = match &self.params {
GaussianRandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
GaussianRandomProjectionParamsInner::Epsilon { eps } => {
johnson_lindenstrauss_min_dim(n_samples, *eps)
}
};

if n_dims > n_features {
return Err(ReductionError::DimensionIncrease(n_dims, n_features));
}

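        // Entries are sampled from N(0, 1/n_features), so for inputs with
        // unit-variance, independent coordinates, each projected coordinate
        // also has unit variance.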
let std_dev = F::cast(n_features).sqrt().recip();
let gaussian = Normal::new(F::zero(), std_dev)?;

let proj = Array::random_using((n_features, n_dims), gaussian, &mut rng);

Ok(GaussianRandomProjection { projection: proj })
}
}

impl<F: Float> GaussianRandomProjection<F> {
/// Create new parameters for a [`GaussianRandomProjection`] with default value
/// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
pub fn params() -> GaussianRandomProjectionParams<Xoshiro256Plus> {
GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
params: GaussianRandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng: Xoshiro256Plus::seed_from_u64(42),
})
}

/// Create new parameters for a [`GaussianRandomProjection`] with default value
/// `eps = 0.1` and the provided [`Rng`].
pub fn params_with_rng<R>(rng: R) -> GaussianRandomProjectionParams<R>
where
R: Rng + Clone,
{
GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
params: GaussianRandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng,
})
}
}

impl_proj! {GaussianRandomProjection<F>}
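`impl_proj!` itself is not shown in this diff; presumably it generates the `Transformer` impls the examples rely on. The operation it wraps is just a matrix product, sketched here with hypothetical names:

```rust
use ndarray::{Array2, ArrayView2};

/// Sketch of what the random-projection transform boils down to:
/// rows of `x` (n_samples, n_features) multiplied by the fitted
/// `projection` (n_features, n_dims) yield (n_samples, n_dims).
fn project(x: ArrayView2<'_, f32>, projection: ArrayView2<'_, f32>) -> Array2<f32> {
    x.dot(&projection)
}
```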