Feat: implement Random Projections #332

Merged · 8 commits · Mar 30, 2024 · diff shown from 6 commits
9 changes: 8 additions & 1 deletion algorithms/linfa-reduction/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "linfa-reduction"
version = "0.7.0"
authors = ["Lorenz Schmidt <[email protected]>"]
authors = [
"Lorenz Schmidt <[email protected]>",
"Gabriel Bathie <[email protected]>",
]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }

linfa = { version = "0.7.0", path = "../.." }
linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
sprs = "0.11.1"
rand_xoshiro = "0.6.0"

[dev-dependencies]
ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
"generate",
] }
approx = { version = "0.4" }
mnist = { version = "0.6.0", features = ["download"] }
linfa-trees = { version = "0.7.0", path = "../linfa-trees" }
4 changes: 4 additions & 0 deletions algorithms/linfa-reduction/README.md
@@ -11,6 +11,8 @@
`linfa-reduction` currently provides an implementation of the following dimensionality reduction methods:
- Diffusion Mapping
- Principal Component Analysis (PCA)
- Gaussian random projections
- Sparse random projections

## Examples

@@ -19,6 +21,8 @@ There is a usage example in the `examples/` directory. To run, use:
```bash
$ cargo run --release --example diffusion_map
$ cargo run --release --example pca
$ cargo run --release --example gaussian_projection
$ cargo run --release --example sparse_projection
```
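A minimal sketch of calling the new API directly (hedged: the builder calls mirror those in this PR's examples, while `Dataset::from` on a bare `ndarray` array follows linfa's usual conventions):

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use ndarray::Array2;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical data: 50 samples with 1_000 features each.
    let data = Array2::<f32>::ones((50, 1_000));
    let dataset = Dataset::from(data);

    // Fit a projection down to 10 dimensions, then apply it.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(10)
        .fit(&dataset)?;
    let reduced = proj.transform(&dataset);
    assert_eq!(reduced.nfeatures(), 10);
    Ok(())
}
```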

## BLAS/LAPACK backend
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model precision: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a decision tree on the MNIST dataset, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model precision: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
8 changes: 8 additions & 0 deletions algorithms/linfa-reduction/src/error.rs
@@ -18,4 +18,12 @@ pub enum ReductionError {
LinalgError(#[from] linfa_linalg::LinalgError),
#[error(transparent)]
LinfaError(#[from] linfa::error::Error),
#[error(transparent)]
NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
#[error("Precision parameter must be in the interval (0; 1)")]
InvalidPrecision,
#[error("Target dimension of the projection must be positive")]
NonPositiveEmbeddingSize,
#[error("Target dimension {0} is larger than the number of features {1}.")]
DimensionIncrease(usize, usize),
}
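As a quick illustration of the new `DimensionIncrease` variant (a hedged sketch on a hypothetical dataset; the fit-time check itself appears in the Gaussian projection's `fit` further down):

```rust
use linfa::prelude::*;
use linfa_reduction::{random_projection::GaussianRandomProjection, ReductionError};
use ndarray::Array2;

fn main() {
    // 100 samples with 20 features: requesting 50 output dimensions
    // must fail, since a projection should not increase dimensionality.
    let dataset = Dataset::from(Array2::<f32>::zeros((100, 20)));
    let res = GaussianRandomProjection::<f32>::params()
        .target_dim(50)
        .fit(&dataset);
    assert!(matches!(res, Err(ReductionError::DimensionIncrease(50, 20))));
}
```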
1 change: 1 addition & 0 deletions algorithms/linfa-reduction/src/lib.rs
@@ -6,6 +6,7 @@ extern crate ndarray;
mod diffusion_map;
mod error;
mod pca;
pub mod random_projection;
pub mod utils;

pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
38 changes: 38 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/common.rs
@@ -0,0 +1,38 @@
/// Compute a safe dimension for a projection with precision `eps`,
/// using the Johnson-Lindenstrauss Lemma.
///
/// References:
/// - [D. Achlioptas, JCSS](https://www.sciencedirect.com/science/article/pii/S0022000003000254)
/// - [Li et al., SIGKDD'06](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf)
pub(crate) fn johnson_lindenstrauss_min_dim(n_samples: usize, eps: f64) -> usize {
let log_samples = (n_samples as f64).ln();
let value = 4. * log_samples / (eps.powi(2) / 2. - eps.powi(3) / 3.);
value as usize
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
/// Test against values computed by the scikit-learn implementation
/// of `johnson_lindenstrauss_min_dim`.
fn test_johnson_lindenstrauss() {
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.05), 15244);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.1), 3947);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.2), 1062);
assert_eq!(johnson_lindenstrauss_min_dim(100, 0.5), 221);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.05), 22867);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.1), 5920);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.2), 1594);
assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.5), 331);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.05), 28194);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.1), 7300);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.2), 1965);
assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.5), 408);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.05), 30489);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.1), 7894);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.2), 2125);
assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.5), 442);
}
}
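For reference, the bound implemented above is the standard Johnson-Lindenstrauss estimate (the same one scikit-learn's `johnson_lindenstrauss_min_dim` uses), truncated to an integer:

```math
k \ge \frac{4 \ln n}{\varepsilon^2/2 - \varepsilon^3/3}
```

As a worked check against the test values: for `n = 100` and `eps = 0.5`, `4 ln 100 ≈ 18.421` and `0.5²/2 - 0.5³/3 ≈ 0.0833`, giving `18.421 / 0.0833 ≈ 221.0`, which truncates to the `221` asserted above.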
77 changes: 77 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/gaussian/algorithm.rs
@@ -0,0 +1,77 @@
use linfa::{prelude::Records, traits::Fit, Float};
use ndarray::{Array, Array2, Ix2};
use ndarray_rand::{
rand_distr::{Normal, StandardNormal},
RandomExt,
};
use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::super::common::johnson_lindenstrauss_min_dim;
use super::hyperparams::GaussianRandomProjectionParamsInner;
use super::{GaussianRandomProjectionParams, GaussianRandomProjectionValidParams};
use crate::{impl_proj, ReductionError};

/// Embedding via Gaussian random projection
pub struct GaussianRandomProjection<F: Float> {
projection: Array2<F>,
}

impl<F, Rec, T, R> Fit<Rec, T, ReductionError> for GaussianRandomProjectionValidParams<R>
where
F: Float,
Rec: Records<Elem = F>,
R: Rng + Clone,
StandardNormal: Distribution<F>,
{
type Object = GaussianRandomProjection<F>;

fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
let n_samples = dataset.nsamples();
let n_features = dataset.nfeatures();
let mut rng = self.rng.clone();

let n_dims = match &self.params {
GaussianRandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
GaussianRandomProjectionParamsInner::Epsilon { eps } => {
johnson_lindenstrauss_min_dim(n_samples, *eps)
}
};

if n_dims > n_features {
return Err(ReductionError::DimensionIncrease(n_dims, n_features));
}

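        // Entries are sampled from N(0, 1/n_features), so for inputs with
        // unit-variance, independent coordinates, each projected coordinate
        // also has unit variance.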
let std_dev = F::cast(n_features).sqrt().recip();
let gaussian = Normal::new(F::zero(), std_dev)?;

let proj = Array::random_using((n_features, n_dims), gaussian, &mut rng);

Ok(GaussianRandomProjection { projection: proj })
}
}

impl<F: Float> GaussianRandomProjection<F> {
/// Create new parameters for a [`GaussianRandomProjection`] with default value
/// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
pub fn params() -> GaussianRandomProjectionParams<Xoshiro256Plus> {
GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
params: GaussianRandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng: Xoshiro256Plus::seed_from_u64(42),
})
}

/// Create new parameters for a [`GaussianRandomProjection`] with default value
/// `eps = 0.1` and the provided [`Rng`].
pub fn params_with_rng<R>(rng: R) -> GaussianRandomProjectionParams<R>
where
R: Rng + Clone,
{
GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
params: GaussianRandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng,
})
}
}

impl_proj! {GaussianRandomProjection<F>}
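`impl_proj!` itself is not shown in this diff; presumably it generates the `Transformer` impls the examples rely on. The operation it wraps is just a matrix product, sketched here with hypothetical names:

```rust
use ndarray::{Array2, ArrayView2};

/// Sketch of what the random-projection transform boils down to:
/// rows of `x` (n_samples, n_features) multiplied by the fitted
/// `projection` (n_features, n_dims) yield (n_samples, n_dims).
fn project(x: ArrayView2<'_, f32>, projection: ArrayView2<'_, f32>) -> Array2<f32> {
    x.dot(&projection)
}
```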